Implement BMad Must-Have Before Launch fixes for all 6 products

P1: API key redaction, SSE billing leak, token math edge cases, CI runner config. P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility. P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier. P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency. P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback. P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate.
2026-03-01 02:14:04 +00:00
parent b24cfa7c0d
commit d038cd9c5c
6 changed files with 1305 additions and 0 deletions
--- a/products/05-aws-cost-anomaly/test-architecture/test-architecture.md
+++ b/products/05-aws-cost-anomaly/test-architecture/test-architecture.md
@@ -710,3 +710,198 @@ export const makeBaseline = (overrides) => ({
   - CDK definitions, LocalStack event injection, wire everything together.

 *End of dd0c/cost Test Architecture (v2)*
+
+---
+
+## 12. BMad Review Implementation (Must-Have Before Launch)
+
+### 12.1 Concurrent Baseline Update Conflict Test
+
+```typescript
+// Exercises optimistic-concurrency handling of baseline writes: two writers race on the
+// same tenant/resource key and the stored baseline must end up reflecting both observations.
+describe('Concurrent Baseline Updates (DynamoDB TransactWriteItem)', () => {
+  it('two simultaneous Lambda invocations converge to correct baseline', async () => {
+    // Seed baseline: mean=1.00, stddev=0.10, count=20
+    await seedBaseline('tenant-1', 'ec2/m5.xlarge', { mean: 1.00, stddev: 0.10, count: 20 });
+
+    // Two events arrive simultaneously for the same resource type
+    const event1 = makeCostEvent({ hourlyCost: 1.50 });
+    const event2 = makeCostEvent({ hourlyCost: 2.00 });
+
+    // Process concurrently; allSettled so a rejection shows up as a result instead of throwing here
+    const [result1, result2] = await Promise.allSettled([
+      processEvent('tenant-1', 'ec2/m5.xlarge', event1),
+      processEvent('tenant-1', 'ec2/m5.xlarge', event2),
+    ]);
+
+    // Both calls must end up fulfilled: the writer that loses the race is expected to
+    // hit ConditionalCheckFailed and retry rather than reject.
+    const successes = [result1, result2].filter(r => r.status === 'fulfilled');
+    expect(successes.length).toBe(2);
+
+    // Final baseline must reflect BOTH events
+    const baseline = await getBaseline('tenant-1', 'ec2/m5.xlarge');
+    expect(baseline.count).toBe(22); // 20 + 2
+    // The running mean is order-independent, so whichever writer won the race the
+    // result is (20 * 1.00 + 1.50 + 2.00) / 22. A lost update would leave a different mean.
+    expect(baseline.mean).toBeCloseTo((20 * 1.00 + 1.50 + 2.00) / 22, 6);
+  });
+
+  it('ConditionalCheckFailed triggers retry with fresh baseline read', async () => {
+    const spy = vi.spyOn(dynamoClient, 'transactWriteItems');
+
+    // Force a conflict on first attempt
+    mockConflictOnce();
+
+    await processEvent('tenant-1', 'ec2/m5.xlarge', makeCostEvent({ hourlyCost: 3.00 }));
+
+    // Should have been called twice (initial + retry)
+    expect(spy).toHaveBeenCalledTimes(2);
+  });
+});
+```
+
+### 12.2 Remediation RBAC
+
+```typescript
+// Authorization checks for Slack-triggered actions: each case builds an action payload,
+// sends it through handleSlackAction and checks the HTTP-style status on the response.
+describe('Remediation Authorization', () => {
+  it('only account owners can click Stop Instance', async () => {
+    const response = await handleSlackAction(
+      makeSlackAction('stop_instance', { userId: 'U_OWNER' })
+    );
+
+    expect(response.status).toBe(200);
+  });
+
+  it('viewer role cannot trigger remediation', async () => {
+    const response = await handleSlackAction(
+      makeSlackAction('stop_instance', { userId: 'U_VIEWER' })
+    );
+
+    // Rejected with an explanatory error, not just a bare status.
+    expect(response.status).toBe(403);
+    expect(response.body.error).toContain('insufficient permissions');
+  });
+
+  it('user from different Slack workspace cannot trigger remediation', async () => {
+    // Same action, but the payload carries another workspace's team id.
+    const foreignIdentity = { userId: 'U_FOREIGN', teamId: 'T_OTHER_WORKSPACE' };
+    const response = await handleSlackAction(
+      makeSlackAction('stop_instance', foreignIdentity)
+    );
+
+    expect(response.status).toBe(403);
+  });
+
+  it('snooze and mark-expected are allowed for all authenticated users', async () => {
+    const response = await handleSlackAction(
+      makeSlackAction('snooze_24h', { userId: 'U_VIEWER' })
+    );
+
+    expect(response.status).toBe(200);
+  });
+});
+```
+
+### 12.3 Clock Interface for Governance Tests
+
+```typescript
+// src/governance/clock.ts
+interface Clock { // Injectable time source; the tests below pass an implementation to GovernanceEngine.
+  now(): number; // Current time as a number (FakeClock below seeds it from epoch milliseconds).
+  advanceBy(ms: number): void; // Moves the clock forward by `ms`. NOTE(review): this is a test-only capability on the shared interface — confirm production clocks are meant to implement it.
+}
+
+/**
+ * Manually driven Clock: the reported time starts at `start` (defaulting to the
+ * wall-clock time at construction) and only changes when advanceBy is called.
+ */
+class FakeClock implements Clock {
+  private currentMs: number;
+
+  constructor(start = Date.now()) {
+    this.currentMs = start;
+  }
+
+  /** Returns the time currently held by this clock. */
+  now(): number {
+    return this.currentMs;
+  }
+
+  /** Shifts the held time by `ms`. */
+  advanceBy(ms: number): void {
+    this.currentMs = this.currentMs + ms;
+  }
+}
+
+// Auto-promotion checks driven by an injected FakeClock: each test moves the clock a
+// whole number of days past the fixed start and inspects evaluatePromotion's result.
+describe('14-Day Auto-Promotion (Clock-Injected)', () => {
+  // One day expressed in milliseconds, the unit advanceBy takes.
+  const DAY_MS = 24 * 60 * 60 * 1000;
+
+  let clock: FakeClock;
+  let governance: GovernanceEngine;
+
+  beforeEach(() => {
+    // Fresh clock and engine per test so advances never accumulate across tests.
+    const startMs = new Date('2026-03-01').getTime();
+    clock = new FakeClock(startMs);
+    governance = new GovernanceEngine(clock);
+  });
+
+  it('does not promote at day 13', () => {
+    clock.advanceBy(13 * DAY_MS);
+
+    const outcome = governance.evaluatePromotion('tenant-1', { fpRate: 0.05 });
+
+    expect(outcome.promoted).toBe(false);
+  });
+
+  it('promotes at day 15 with low FP rate', () => {
+    clock.advanceBy(15 * DAY_MS);
+
+    const outcome = governance.evaluatePromotion('tenant-1', { fpRate: 0.05 });
+
+    expect(outcome.promoted).toBe(true);
+    expect(outcome.newMode).toBe('audit');
+  });
+
+  it('does not promote at day 15 with high FP rate', () => {
+    clock.advanceBy(15 * DAY_MS);
+
+    const outcome = governance.evaluatePromotion('tenant-1', { fpRate: 0.15 });
+
+    // Old enough, but blocked, and the reason must name the false-positive rate.
+    expect(outcome.promoted).toBe(false);
+    expect(outcome.reason).toContain('false-positive rate');
+  });
+});
+```
+
+### 12.4 Property-Based Tests with 10K Runs
+
+```typescript
+// Property-based invariants for scorer.score: each property is checked against 10,000
+// generated inputs with a fixed seed so any failure replays identically.
+describe('Anomaly Scorer (fast-check, 10K runs)', () => {
+  it('score is always between 0 and 100', () => {
+    fc.assert(
+      fc.property(
+        fc.record({
+          cost: fc.float({ min: 0, max: 10000, noNaN: true }),
+          mean: fc.float({ min: 0, max: 10000, noNaN: true }),
+          // stddev may be generated as exactly 0 here, so the zero-variance baseline is in scope.
+          stddev: fc.float({ min: 0, max: 1000, noNaN: true }),
+        }),
+        (input) => {
+          const score = scorer.score(input);
+          // A NaN score fails both comparisons, so it is reported as a counterexample.
+          return score >= 0 && score <= 100;
+        }
+      ),
+      { numRuns: 10000, seed: 42 } // Reproducible
+    );
+  });
+
+  it('score monotonically increases as cost increases', () => {
+    fc.assert(
+      fc.property(
+        fc.float({ min: 0, max: 100, noNaN: true }),
+        fc.float({ min: 0, max: 100, noNaN: true }),
+        // fc.float only accepts bounds that are exactly representable as 32-bit floats.
+        // 0.01 is not, so round it with Math.fround instead of letting the arbitrary throw.
+        fc.float({ min: Math.fround(0.01), max: 50, noNaN: true }),
+        (costA, costB, stddev) => {
+          // NOTE(review): mean is fixed at 5.0 while costs start at 0, so this also demands
+          // monotonicity for costs below the mean — confirm the scorer never treats
+          // under-spend as anomalous.
+          const baseline = { mean: 5.0, stddev };
+          // Order the two generated costs so scoreA always belongs to the cheaper one.
+          const scoreA = scorer.score({ cost: Math.min(costA, costB), ...baseline });
+          const scoreB = scorer.score({ cost: Math.max(costA, costB), ...baseline });
+          return scoreB >= scoreA;
+        }
+      ),
+      { numRuns: 10000, seed: 42 }
+    );
+  });
+});
+```
+
+### 12.5 Redis Failure During Panic Mode Check
+
+```typescript
+// Fail-safe behaviour of the panic-mode check when Redis cannot be reached: the check
+// must report panic as active and log a warning.
+describe('Panic Mode Redis Failure', () => {
+  afterEach(async () => {
+    // Every test here severs the shared Redis connection. Re-open it so the next test
+    // (and any later suite) starts from a connected client instead of inheriting the outage.
+    // NOTE(review): assumes the client exposes connect(), as ioredis and node-redis do — confirm.
+    await redis.connect();
+  });
+
+  it('defaults to panic=active (safe) when Redis is unreachable', async () => {
+    // Kill Redis connection
+    await redis.disconnect();
+
+    const isPanic = await governance.checkPanicMode('tenant-1');
+    // MUST default to safe (panic active) — not dangerous (panic inactive)
+    expect(isPanic).toBe(true);
+  });
+
+  it('logs warning when Redis is unreachable for panic check', async () => {
+    await redis.disconnect();
+    const logSpy = vi.spyOn(logger, 'warn');
+
+    try {
+      await governance.checkPanicMode('tenant-1');
+
+      expect(logSpy).toHaveBeenCalledWith(
+        expect.stringContaining('Redis unreachable — defaulting to panic=active')
+      );
+    } finally {
+      // Put the real logger.warn back even when the assertion fails, so the spy
+      // does not leak into other tests.
+      logSpy.mockRestore();
+    }
+  });
+});
+```
+
+*End of P5 BMad Implementation*