Implement BMad Must-Have Before Launch fixes for all 6 products
P1: API key redaction, SSE billing leak, token math edge cases, CI runner config
P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility
P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier
P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency
P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback
P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
This commit is contained in:
@@ -1265,3 +1265,225 @@ def test_meilisearch_index_rebuild_does_not_drop_search():
|
||||
# Verify zero-downtime index swapping during mapping updates
|
||||
pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 12. BMad Review Implementation (Must-Have Before Launch)
|
||||
|
||||
### 12.1 Discovery Scan Timeout / Partial Failure Recovery
|
||||
|
||||
```python
|
||||
# tests/integration/test_discovery_resilience.py
|
||||
|
||||
def test_partial_aws_scan_does_not_delete_existing_services():
    """A partial AWS scan (timeout at 500 of 1000 resources) must not
    delete or stale-mark any pre-existing catalog entry."""
    # Pre-populate the catalog with a known number of services.
    seed_catalog(count=1000)

    # Force the scanner to time out halfway through the account.
    with mock_aws_timeout_after(500):
        scan_result = run_aws_discovery_scan()

    assert scan_result.status == "partial_failure"
    assert scan_result.discovered == 500

    # Every seeded service must survive the failed scan.
    surviving = catalog_api.list_services()
    assert len(surviving) == 1000  # NOT 500

    # The half-finished results land in staging, never the live catalog.
    pending = catalog_api.list_staged_updates()
    assert len(pending) == 500
|
||||
|
||||
def test_partial_github_scan_does_not_corrupt_ownership():
    """A GitHub scan aborted by rate limiting mid-run must leave every
    pre-existing ownership mapping untouched."""
    seed_catalog_with_ownership(count=100)

    # Rate limit kicks in after half the repositories are scanned.
    with mock_github_rate_limit_after(50):
        scan_result = run_github_discovery_scan()

    assert scan_result.status == "partial_failure"

    # All 100 services must still carry their owner afterwards.
    with_owner = [svc for svc in catalog_api.list_services() if svc.owner is not None]
    assert len(with_owner) == 100  # NOT 50
|
||||
|
||||
def test_scan_failure_triggers_alert_not_silent_failure():
    """A hard scan failure (invalid credentials) must surface an admin
    alert rather than failing silently."""
    outcome = run_aws_discovery_scan_with_invalid_credentials()
    assert outcome.status == "failed"

    # At least one alert mentioning the failure must reach the admin.
    matching = [a for a in get_admin_alerts() if "discovery scan failed" in a.message]
    assert matching
|
||||
```
|
||||
|
||||
### 12.2 Ownership Conflict Resolution Integration Test
|
||||
|
||||
```python
|
||||
# tests/integration/test_ownership_conflict.py
|
||||
|
||||
def test_explicit_config_overrides_implicit_tag():
    """Ownership precedence: explicit (CODEOWNERS/config) beats
    implicit (AWS tags) beats heuristic (commit history)."""
    # Implicit source: an AWS tag claims "team-infra".
    aws_scanner.discover_service("auth-api", owner_tag="team-infra")

    # Explicit source: CODEOWNERS claims "team-platform".
    github_scanner.discover_service("auth-api", codeowners="team-platform")

    # Conflict resolution must let the explicit source win.
    resolved = catalog_api.get_service("auth-api")
    assert resolved.owner == "team-platform"  # Explicit wins
    assert resolved.owner_source == "codeowners"
|
||||
|
||||
def test_concurrent_discovery_sources_do_not_race():
    """Two scanners discovering the same service simultaneously must
    converge on a single catalog entry, not duplicates."""
    import asyncio

    async def discover_concurrently():
        # Both scanners race on the identical service name.
        await asyncio.gather(
            aws_scanner.discover_service_async("billing-api"),
            github_scanner.discover_service_async("billing-api"),
        )

    asyncio.run(discover_concurrently())

    # Exactly one catalog entry must exist afterwards.
    matches = catalog_api.search("billing-api")
    assert len(matches) == 1  # No duplicates
|
||||
|
||||
def test_heuristic_ownership_does_not_override_explicit():
    """A commit-history heuristic must never displace an owner that was
    set explicitly via config."""
    catalog_api.set_owner("auth-api", "team-platform", source="config")

    # A heuristic pass infers a different owner from commit activity.
    github_scanner.infer_ownership("auth-api", top_committer="dev@other-team.com")

    # The explicit assignment must survive the heuristic pass.
    svc = catalog_api.get_service("auth-api")
    assert svc.owner == "team-platform"  # Explicit preserved
|
||||
```
|
||||
|
||||
### 12.3 VCR Cassette Freshness Validation
|
||||
|
||||
```yaml
|
||||
# .github/workflows/vcr-refresh.yml
# Weekly job to re-record VCR cassettes against real AWS.
# Purpose: detect AWS API response drift before it silently breaks the
# cassette-backed integration tests.
name: VCR Cassette Freshness
on:
  schedule:
    - cron: '0 6 * * 1' # Every Monday 6 AM UTC

jobs:
  refresh:
    # NOTE(review): runs on a self-hosted runner with real AWS secrets —
    # confirm the runner is isolated enough for credential exposure.
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v4
      - name: Re-record cassettes
        env:
          # Dedicated recording credentials, separate from CI's main keys.
          AWS_ACCESS_KEY_ID: ${{ secrets.VCR_AWS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.VCR_AWS_SECRET }}
        run: |
          VCR_RECORD=all pytest tests/integration/scanners/ -v
      - name: Diff cassettes
        # Warn (but do not fail the job) when recorded responses differ
        # from the committed cassettes — drift is expected occasionally.
        run: |
          git diff --stat tests/cassettes/
          CHANGED=$(git diff --name-only tests/cassettes/ | wc -l)
          if [ "$CHANGED" -gt 0 ]; then
            echo "⚠️ $CHANGED cassettes changed — AWS API responses have drifted"
            echo "Review and commit updated cassettes"
          fi
      - name: Create PR if cassettes changed
        # presumably this step no-ops when the working tree is clean —
        # verify against the create-pull-request action's documentation.
        uses: peter-evans/create-pull-request@v6
        with:
          title: "chore: refresh VCR cassettes (AWS API drift)"
          branch: vcr-refresh
|
||||
```
|
||||
|
||||
### 12.4 Meilisearch Zero-Downtime Index Rebuild
|
||||
|
||||
```python
|
||||
# tests/integration/test_meilisearch_rebuild.py
|
||||
|
||||
def test_search_returns_results_during_index_rebuild():
    """Cmd+K search must keep serving hits while the index is being
    rebuilt (zero-downtime swap via a shadow index)."""
    # Seed the live index and wait until the documents are searchable.
    meili.index("services").add_documents(make_services(100))
    meili.index("services").wait_for_pending_update()

    # Kick off the rebuild (creates services_v2, swaps when ready).
    rebuild = start_index_rebuild()

    # Queries against the live index must still return hits mid-rebuild.
    mid_rebuild = meili.index("services").search("auth")
    assert len(mid_rebuild["hits"]) > 0

    rebuild.wait()

    # ... and after the swap has completed.
    post_swap = meili.index("services").search("auth")
    assert len(post_swap["hits"]) > 0
|
||||
|
||||
def test_index_rebuild_failure_does_not_corrupt_active_index():
    """A rebuild that dies mid-way (e.g. OOM) must leave the live index
    fully usable."""
    meili.index("services").add_documents(make_services(50))

    # Simulate the rebuild process running out of memory while indexing.
    with mock_meili_oom_during_rebuild():
        outcome = start_index_rebuild()

    assert outcome.status == "failed"

    # The active index must keep answering queries untouched.
    hits = meili.index("services").search("billing")
    assert len(hits["hits"]) > 0  # Still works
|
||||
```
|
||||
|
||||
### 12.5 Cmd+K Search Latency from Redis Cache
|
||||
|
||||
```python
|
||||
# tests/performance/test_search_latency.py
|
||||
|
||||
def test_cmd_k_search_under_10ms_from_redis():
    """A prefix-cache hit must come back in under 10 ms."""
    import time

    # Pre-populate the prefix cache so the lookup is a pure Redis hit.
    cached_payload = [
        {"name": "auth-service", "owner": "team-platform"},
        {"name": "auth-proxy", "owner": "team-infra"},
    ]
    redis_client.set("search:prefix:auth", json.dumps(cached_payload))

    started = time.perf_counter_ns()
    hits = search_api.prefix_search("auth")
    elapsed_ms = (time.perf_counter_ns() - started) / 1_000_000

    assert elapsed_ms < 10, f"Cmd+K search took {elapsed_ms:.1f}ms — exceeds 10ms SLA"
    assert len(hits) == 2
|
||||
```
|
||||
|
||||
### 12.6 Free Tier Enforcement (50 Services)
|
||||
|
||||
```python
|
||||
def test_free_tier_allows_50_services():
    """The free tier must accept service registrations up to its cap of 50."""
    tenant = create_tenant(tier="free")
    # Every one of the 50 creations must succeed with 201 Created.
    for i in range(50):
        response = catalog_api.create_service(tenant, f"service-{i}")
        assert response.status_code == 201
|
||||
|
||||
def test_free_tier_rejects_51st_service():
    """Creating a 51st service on the free tier must be rejected with a
    403 whose error message points at an upgrade path."""
    tenant = create_tenant(tier="free")
    seed_services(tenant, count=50)

    response = catalog_api.create_service(tenant, "service-51")
    assert response.status_code == 403
    # The rejection must tell the tenant how to lift the cap.
    assert "upgrade" in response.json()["error"].lower()
|
||||
```
|
||||
|
||||
*End of P4 BMad Implementation*
|
||||
|
||||
Reference in New Issue
Block a user