Implement review remediation + PLG analytics SDK

- All 6 test architectures patched with Section 11 addendums
- P5 (cost) fully rewritten from 232 to ~600 lines
- PLG brainstorm + party mode advisory board results
- Analytics SDK v2 (PostHog Cloud, Zod strict, Lambda-safe)
- Analytics tests v2 (safeParse, no , no timestamp, no PII)
- Addresses all Gemini review findings across P1-P6
This commit is contained in:
2026-03-01 01:42:49 +00:00
parent 2fe0ed856e
commit 03bfe931fc
9 changed files with 2950 additions and 85 deletions

View File

@@ -2239,3 +2239,315 @@ Before writing any new function, ask:
*Test Architecture document generated for dd0c/route V1 MVP.*
*Total estimated test count at V1 launch: ~400 tests.*
*Target CI runtime: <8 minutes (unit + integration), <15 minutes (full pipeline with E2E).*
---
## 11. Review Remediation Addendum (Post-Gemini Review)
### 11.1 Replace MockKeyCache/MockKeyStore with Testcontainers
```rust
// BEFORE (anti-pattern — mocks hide real latency):
// let cache = MockKeyCache::new();
// let store = MockKeyStore::new();
// AFTER: Use Testcontainers for hot-path auth tests
/// Hot-path auth check against Testcontainers-backed Redis and Postgres.
///
/// Asserts that a single `auth_middleware` call completes in under 5 ms and returns `Ok`.
#[tokio::test]
async fn auth_middleware_validates_key_under_5ms_with_real_redis() {
    // Backing services are started per test instead of being mocked.
    let redis_container = TestcontainersRedis::start().await;
    let pg_container = TestcontainersPostgres::start().await;
    let key_cache = RedisKeyCache::new(redis_container.connection_string());
    let key_store = PgKeyStore::new(pg_container.connection_string());

    // Time only the middleware call itself, not container start-up.
    let timer = Instant::now();
    let outcome = auth_middleware(&key_cache, &key_store, "sk-valid-key").await;
    let elapsed = timer.elapsed();

    assert!(elapsed < Duration::from_millis(5));
    assert!(outcome.is_ok());
}
#[tokio::test]
async fn auth_middleware_handles_redis_connection_pool_exhaustion() {
// Exhaust all connections, verify fallback to PG
let redis = TestcontainersRedis::start().await;
let cache = RedisKeyCache::with_pool_size(redis.connection_string(), 1);
// Hold the single connection
let _held = cache.raw_connection().await;
// Auth must still work via PG fallback
let result = auth_middleware(&cache, &pg_store, "sk-valid-key").await;
assert!(result.is_ok());
}
```
### 11.2 Fix Encryption Test (Decrypt, Don't Just Assert Non-Plaintext)
```rust
// BEFORE (anti-pattern — passes if stored as random garbage):
// assert_ne!(stored.encrypted_key, b"sk-plaintext-key");
// AFTER: Full round-trip encryption test
#[tokio::test]
async fn provider_credential_encrypts_and_decrypts_correctly() {
let kms = LocalStackKMS::start().await;
let key_id = kms.create_key().await;
let store = CredentialStore::new(pg.pool(), kms.client(), key_id);
let original = "sk-live-abc123xyz";
store.save_credential("org-1", "openai", original).await.unwrap();
// Read raw from DB — must NOT be plaintext
let raw = pg.query_raw("SELECT encrypted_key FROM credentials LIMIT 1").await;
assert!(!String::from_utf8_lossy(&raw).contains(original));
// Decrypt via the store — must match original
let decrypted = store.get_credential("org-1", "openai").await.unwrap();
assert_eq!(decrypted, original);
}
#[tokio::test]
async fn kms_key_rotation_old_deks_still_decrypt_old_credentials() {
let kms = LocalStackKMS::start().await;
let key_id = kms.create_key().await;
let store = CredentialStore::new(pg.pool(), kms.client(), key_id);
// Save with original key
store.save_credential("org-1", "openai", "sk-old").await.unwrap();
// Rotate KMS key
kms.rotate_key(key_id).await;
// Old credential must still decrypt
let decrypted = store.get_credential("org-1", "openai").await.unwrap();
assert_eq!(decrypted, "sk-old");
// New credential uses new DEK
store.save_credential("org-1", "anthropic", "sk-new").await.unwrap();
let decrypted_new = store.get_credential("org-1", "anthropic").await.unwrap();
assert_eq!(decrypted_new, "sk-new");
}
```
### 11.3 Slow Dependency Chaos Test
```rust
/// Chaos case: a 5-second delay on the "timescaledb" dependency must not slow the proxy.
///
/// Asserts that the request returns 200 and completes in under 50 ms.
#[tokio::test]
async fn chaos_slow_db_does_not_block_proxy_hot_path() {
    let e2e = E2EStack::start().await;
    // The spec calls for the delay to be injected on the TimescaleDB port (tc netem).
    let db_delay = Duration::from_secs(5);
    e2e.inject_latency("timescaledb", db_delay).await;

    // Measure the whole proxied request, from building it to receiving the response.
    let timer = Instant::now();
    let response = e2e.proxy()
        .post("/v1/chat/completions")
        .header("Authorization", "Bearer sk-valid")
        .json(&chat_request())
        .send().await;
    let elapsed = timer.elapsed();

    assert_eq!(response.status(), 200);
    // Routing must succeed within the SLA even though the database is slow.
    assert!(
        elapsed < Duration::from_millis(50),
        "Proxy blocked by slow DB: {:?}",
        elapsed
    );
}
/// Chaos case: with 3 seconds of latency injected on "redis", a proxied request must
/// still be answered with a 200.
#[tokio::test]
async fn chaos_slow_redis_falls_back_to_pg_for_auth() {
    let e2e = E2EStack::start().await;
    let redis_delay = Duration::from_secs(3);
    e2e.inject_latency("redis", redis_delay).await;

    let response = e2e.proxy()
        .post("/v1/chat/completions")
        .header("Authorization", "Bearer sk-valid")
        .json(&chat_request())
        .send().await;

    assert_eq!(response.status(), 200);
}
```
### 11.4 IDOR / Cross-Tenant Test Suite
```rust
// tests/integration/idor_test.rs
/// Cross-tenant read check: a routing rule created by org B must look non-existent to org A.
#[tokio::test]
async fn idor_org_a_cannot_read_org_b_routing_rules() {
    let e2e = E2EStack::start().await;
    let token_a = e2e.create_org_and_token("org-a").await;
    let token_b = e2e.create_org_and_token("org-b").await;

    // Org B owns the rule.
    let rule_payload = json!({ "name": "secret-rule", "model": "gpt-4" });
    let created_rule = e2e.api()
        .post("/v1/routing-rules")
        .bearer_auth(&token_b)
        .json(&rule_payload)
        .send().await.json::<RoutingRule>().await;

    // Org A requests it by id.
    let rule_url = format!("/v1/routing-rules/{}", created_rule.id);
    let response = e2e.api()
        .get(&rule_url)
        .bearer_auth(&token_a)
        .send().await;

    // 404 rather than 403, so the response does not reveal that the rule exists.
    assert_eq!(response.status(), 404);
}
/// Cross-tenant read check for API keys: org A must not be able to read org B's keys.
///
/// Not implemented yet. The test is ignored and panics via `todo!()` so that it is reported
/// as pending instead of passing vacuously.
#[tokio::test]
#[ignore = "IDOR scenario not implemented yet"]
async fn idor_org_a_cannot_read_org_b_api_keys() {
    // Same pattern as the routing-rule case — create a key as org B, attempt the read as org A.
    todo!("create an API key as org B, then assert org A cannot read it")
}
/// Cross-tenant read check for telemetry: org A must not be able to read org B's telemetry.
///
/// Not implemented yet. The test is ignored and panics via `todo!()` so that it is reported
/// as pending instead of passing vacuously.
#[tokio::test]
#[ignore = "IDOR scenario not implemented yet"]
async fn idor_org_a_cannot_read_org_b_telemetry() {
    todo!("record telemetry for org B, then assert org A cannot read it")
}
/// Cross-tenant write check: org A must not be able to modify org B's routing rules.
///
/// Not implemented yet. The test is ignored and panics via `todo!()` so that it is reported
/// as pending instead of passing vacuously.
#[tokio::test]
#[ignore = "IDOR scenario not implemented yet"]
async fn idor_org_a_cannot_mutate_org_b_routing_rules() {
    todo!("create a routing rule as org B, then assert org A cannot update or delete it")
}
```
### 11.5 SSE Connection Drop / Billing Leak Test
```rust
/// Billing-leak guard: dropping a streaming response early must tear down the upstream
/// provider connection and cap the recorded completion tokens.
#[tokio::test]
async fn sse_client_disconnect_aborts_upstream_provider_request() {
    let e2e = E2EStack::start().await;
    let provider = e2e.mock_provider();
    // Slow stream: 60 tokens at one per second, per the original spec comment.
    provider.configure_slow_stream(60, Duration::from_secs(1));

    let streaming_request = json!({ "stream": true, "model": "gpt-4" });
    let mut sse = e2e.proxy()
        .post("/v1/chat/completions")
        .json(&streaming_request)
        .send().await
        .bytes_stream();

    // Pull five items off the stream (presumably one token each), then hang up.
    for _item in 0..5 {
        let _ = sse.next().await;
    }
    drop(sse);

    // Give the proxy a moment to notice the disconnect and clean up.
    let cleanup_grace = Duration::from_millis(500);
    tokio::time::sleep(cleanup_grace).await;

    // The provider must no longer be streaming to anyone.
    assert_eq!(provider.active_connections(), 0);

    // Usage should reflect what was delivered, with a small allowance for in-flight
    // tokens — not the full 60.
    let usage = e2e.get_last_usage_record().await;
    assert!(usage.completion_tokens <= 10);
}
```
### 11.6 Concurrent Circuit Breaker Race Condition
```rust
/// Records 50 failures concurrently against a Redis-backed circuit breaker and checks that
/// the breaker ends up open with a failure count of exactly 50.
#[tokio::test]
async fn circuit_breaker_handles_50_concurrent_failures_cleanly() {
    let redis = TestcontainersRedis::start().await;
    // NOTE(review): the trailing `10` is presumably the failure threshold — confirm.
    let breaker = RedisCircuitBreaker::new(redis.connection_string(), "openai", 10);

    let handles: Vec<_> = (0..50)
        .map(|_| {
            let b = breaker.clone();
            tokio::spawn(async move {
                b.record_failure().await;
            })
        })
        .collect();

    // Surface a panic inside any spawned task directly, instead of discarding the join
    // results and only seeing an unexplained count mismatch below.
    for joined in futures::future::join_all(handles).await {
        joined.expect("record_failure task panicked or was cancelled");
    }

    // Breaker must be open — no race condition leaving it closed.
    assert_eq!(breaker.state().await, CircuitState::Open);
    // Failure count must be exactly 50 (no lost increments).
    assert_eq!(breaker.failure_count().await, 50);
}
```
### 11.7 Trace Context Propagation
```rust
/// A `traceparent` header sent by the client must carry through the proxy: the
/// `proxy.route` span and the outgoing provider request must both use the same trace id.
#[tokio::test]
async fn otel_trace_propagates_from_client_through_proxy_to_provider() {
    // Trace id embedded in the `traceparent` header sent below.
    let trace_id = "4bf92f3577b34da6a3ce929d0e0e4736";

    let stack = E2EStack::start().await;
    let tracer = stack.in_memory_tracer();
    let resp = stack.proxy()
        .post("/v1/chat/completions")
        .header("traceparent", "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01")
        .json(&chat_request())
        .send().await;
    // Fail early with a clear status mismatch if the request itself did not succeed.
    assert_eq!(resp.status(), 200);

    let spans = tracer.finished_spans();
    let proxy_span = spans
        .iter()
        .find(|s| s.name == "proxy.route")
        .expect("no finished span named `proxy.route` was recorded");
    // The proxy span must belong to the incoming trace.
    assert_eq!(proxy_span.trace_id, trace_id);
    // The provider request must carry the same trace id.
    let provider_req = stack.mock_provider().last_request();
    assert!(provider_req.headers["traceparent"].contains(trace_id));
}
```
### 11.8 Flag Provider Fallback Test
```rust
/// A flag provider pointed at a non-existent config file must return the caller-supplied
/// default (`false`) instead of panicking.
#[test]
fn flag_provider_unreachable_falls_back_to_safe_default() {
    // Simulate a missing flag config file.
    let provider = JsonFileProvider::new("/nonexistent/flags.json");
    let result = provider.evaluate("enable_new_router", false);
    // Must return the safe default (false), not panic or error.
    assert!(!result, "missing flag file must evaluate to the safe default (false)");
}
/// A flag provider built from malformed JSON must return the caller-supplied default
/// (`false`) instead of panicking.
#[test]
fn flag_provider_malformed_json_falls_back_to_safe_default() {
    let provider = JsonFileProvider::from_string("{ invalid json }}}");
    let result = provider.evaluate("enable_new_router", false);
    assert!(!result, "malformed flag JSON must evaluate to the safe default (false)");
}
```
### 11.9 24-Hour Soak Test Spec
```rust
// tests/soak/long_running_latency.rs
// Skipped by default because of `#[ignore]`. To run it explicitly (manually or from the
// nightly CI job): cargo test --test soak -- --ignored
/// Specification placeholder for the 24-hour soak test of proxy latency.
///
/// NOTE(review): the body contains only comments, so when run with `--ignored` this test
/// currently passes without exercising anything.
#[tokio::test]
#[ignore] // Excluded from normal test runs; intended for nightly CI
async fn soak_24h_proxy_latency_stays_under_5ms_p99() {
// Planned load profile (k6): 10 RPS sustained for 24 hours
// Planned assertions: p99 < 5ms, no memory growth > 50MB, no connection leaks
// Goal: catch memory fragmentation and connection pool exhaustion that only show up
// under long-running load
}
```
### 11.10 Panic Mode Authorization
```rust
/// A token with the `Viewer` role must be refused (403) when posting to `/admin/panic`.
#[tokio::test]
async fn panic_mode_requires_owner_role() {
    let e2e = E2EStack::start().await;
    let token = e2e.create_token_with_role("org-1", Role::Viewer).await;

    let response = e2e.api()
        .post("/admin/panic")
        .bearer_auth(&token)
        .send().await;

    assert_eq!(response.status(), 403);
}
#[tokio::test]
async fn panic_mode_allowed_for_owner_role() {
let owner_token = stack.create_token_with_role("org-1", Role::Owner).await;
let resp = stack.api()
.post("/admin/panic")
.bearer_auth(&owner_token)
.send().await;
assert_eq!(resp.status(), 200);
}
```
*End of P1 Review Remediation Addendum*