Implement review remediation + PLG analytics SDK
- All 6 test architectures patched with Section 11 addendums - P5 (cost) fully rewritten from 232 to ~600 lines - PLG brainstorm + party mode advisory board results - Analytics SDK v2 (PostHog Cloud, Zod strict, Lambda-safe) - Analytics tests v2 (safeParse, no $set, no timestamp, no PII) - Addresses all Gemini review findings across P1-P6
This commit is contained in:
138
products/01-llm-cost-router/src/analytics/index.ts
Normal file
138
products/01-llm-cost-router/src/analytics/index.ts
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
import { PostHog } from 'posthog-node';
|
||||||
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------
|
||||||
|
// 1. Unified Event Taxonomy (Zod Enforced, Strictly Typed)
|
||||||
|
// ---------------------------------------------------------
|
||||||
|
|
||||||
|
export enum EventName {
|
||||||
|
SignupCompleted = 'account.signup.completed',
|
||||||
|
FirstDollarSaved = 'routing.savings.first_dollar',
|
||||||
|
UpgradeCompleted = 'billing.upgrade.completed',
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-event property schemas — no z.any() PII loophole
|
||||||
|
const SignupProperties = z.object({
|
||||||
|
method: z.enum(['github_sso', 'google_sso', 'email']),
|
||||||
|
}).strict();
|
||||||
|
|
||||||
|
const ActivationProperties = z.object({
|
||||||
|
savings_amount: z.number().nonnegative(),
|
||||||
|
}).strict();
|
||||||
|
|
||||||
|
const UpgradeProperties = z.object({
|
||||||
|
plan: z.enum(['pro', 'business']),
|
||||||
|
mrr_increase: z.number().nonnegative(),
|
||||||
|
}).strict();
|
||||||
|
|
||||||
|
const PropertiesMap = {
|
||||||
|
[EventName.SignupCompleted]: SignupProperties,
|
||||||
|
[EventName.FirstDollarSaved]: ActivationProperties,
|
||||||
|
[EventName.UpgradeCompleted]: UpgradeProperties,
|
||||||
|
} as const;
|
||||||
|
|
||||||
|
export const EventSchema = z.object({
|
||||||
|
name: z.nativeEnum(EventName),
|
||||||
|
tenant_id: z.string().min(1, 'tenant_id is required'),
|
||||||
|
product: z.literal('route'),
|
||||||
|
properties: z.record(z.unknown()).optional().default({}),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type AnalyticsEvent = z.infer<typeof EventSchema>;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------
|
||||||
|
// 2. NoOp Client for local/test environments
|
||||||
|
// ---------------------------------------------------------
|
||||||
|
|
||||||
|
class NoOpPostHog {
|
||||||
|
capture() {}
|
||||||
|
identify() {}
|
||||||
|
async flushAsync() {}
|
||||||
|
async shutdown() {}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------
|
||||||
|
// 3. Analytics SDK (PostHog Cloud, Lambda-Safe)
|
||||||
|
// ---------------------------------------------------------
|
||||||
|
|
||||||
|
export class Analytics {
|
||||||
|
private client: PostHog | NoOpPostHog;
|
||||||
|
public readonly isSessionReplayEnabled = false;
|
||||||
|
|
||||||
|
constructor(client?: PostHog) {
|
||||||
|
if (client) {
|
||||||
|
this.client = client;
|
||||||
|
} else {
|
||||||
|
const apiKey = process.env.POSTHOG_API_KEY;
|
||||||
|
if (!apiKey) {
|
||||||
|
// No key = NoOp. Never silently send to a mock key.
|
||||||
|
console.warn('[Analytics] POSTHOG_API_KEY not set — using NoOp client');
|
||||||
|
this.client = new NoOpPostHog();
|
||||||
|
} else {
|
||||||
|
this.client = new PostHog(apiKey, {
|
||||||
|
host: 'https://us.i.posthog.com',
|
||||||
|
flushAt: 20, // Batch up to 20 events
|
||||||
|
flushInterval: 5000, // Or flush every 5s
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Identify a tenant once (on signup). Sets $set properties.
|
||||||
|
* Call this instead of embedding $set in every track() call.
|
||||||
|
*/
|
||||||
|
public identify(tenantId: string, properties?: Record<string, unknown>): void {
|
||||||
|
this.client.identify({
|
||||||
|
distinctId: tenantId,
|
||||||
|
properties: { tenant_id: tenantId, ...properties },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Track an event. Uses safeParse — never crashes the caller.
|
||||||
|
* Does NOT flush. Call flush() at Lambda teardown.
|
||||||
|
*/
|
||||||
|
public track(event: AnalyticsEvent): boolean {
|
||||||
|
// 1. Base schema validation
|
||||||
|
const baseResult = EventSchema.safeParse(event);
|
||||||
|
if (!baseResult.success) {
|
||||||
|
console.error('[Analytics] Invalid event (base):', baseResult.error.format());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Per-event property validation (strict, no PII loophole)
|
||||||
|
const propSchema = PropertiesMap[baseResult.data.name];
|
||||||
|
if (propSchema) {
|
||||||
|
const propResult = propSchema.safeParse(baseResult.data.properties);
|
||||||
|
if (!propResult.success) {
|
||||||
|
console.error('[Analytics] Invalid properties:', propResult.error.format());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Capture — let PostHog assign the timestamp (avoids clock skew)
|
||||||
|
this.client.capture({
|
||||||
|
distinctId: baseResult.data.tenant_id,
|
||||||
|
event: baseResult.data.name,
|
||||||
|
properties: {
|
||||||
|
product: baseResult.data.product,
|
||||||
|
...baseResult.data.properties,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flush all queued events. Call once at Lambda teardown
|
||||||
|
* (e.g., in a Middy middleware or handler's finally block).
|
||||||
|
*/
|
||||||
|
public async flush(): Promise<void> {
|
||||||
|
await this.client.flushAsync();
|
||||||
|
}
|
||||||
|
|
||||||
|
public async shutdown(): Promise<void> {
|
||||||
|
await this.client.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -2239,3 +2239,315 @@ Before writing any new function, ask:
|
|||||||
*Test Architecture document generated for dd0c/route V1 MVP.*
|
*Test Architecture document generated for dd0c/route V1 MVP.*
|
||||||
*Total estimated test count at V1 launch: ~400 tests.*
|
*Total estimated test count at V1 launch: ~400 tests.*
|
||||||
*Target CI runtime: <8 minutes (unit + integration), <15 minutes (full pipeline with E2E).*
|
*Target CI runtime: <8 minutes (unit + integration), <15 minutes (full pipeline with E2E).*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Review Remediation Addendum (Post-Gemini Review)
|
||||||
|
|
||||||
|
### 11.1 Replace MockKeyCache/MockKeyStore with Testcontainers
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// BEFORE (anti-pattern — mocks hide real latency):
|
||||||
|
// let cache = MockKeyCache::new();
|
||||||
|
// let store = MockKeyStore::new();
|
||||||
|
|
||||||
|
// AFTER: Use Testcontainers for hot-path auth tests
|
||||||
|
#[tokio::test]
|
||||||
|
async fn auth_middleware_validates_key_under_5ms_with_real_redis() {
|
||||||
|
let redis = TestcontainersRedis::start().await;
|
||||||
|
let pg = TestcontainersPostgres::start().await;
|
||||||
|
let cache = RedisKeyCache::new(redis.connection_string());
|
||||||
|
let store = PgKeyStore::new(pg.connection_string());
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
|
let result = auth_middleware(&cache, &store, "sk-valid-key").await;
|
||||||
|
assert!(start.elapsed() < Duration::from_millis(5));
|
||||||
|
assert!(result.is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn auth_middleware_handles_redis_connection_pool_exhaustion() {
|
||||||
|
// Exhaust all connections, verify fallback to PG
|
||||||
|
let redis = TestcontainersRedis::start().await;
|
||||||
|
let cache = RedisKeyCache::with_pool_size(redis.connection_string(), 1);
|
||||||
|
// Hold the single connection
|
||||||
|
let _held = cache.raw_connection().await;
|
||||||
|
// Auth must still work via PG fallback
|
||||||
|
let result = auth_middleware(&cache, &pg_store, "sk-valid-key").await;
|
||||||
|
assert!(result.is_ok());
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.2 Fix Encryption Test (Decrypt, Don't Just Assert Non-Plaintext)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// BEFORE (anti-pattern — passes if stored as random garbage):
|
||||||
|
// assert_ne!(stored.encrypted_key, b"sk-plaintext-key");
|
||||||
|
|
||||||
|
// AFTER: Full round-trip encryption test
|
||||||
|
#[tokio::test]
|
||||||
|
async fn provider_credential_encrypts_and_decrypts_correctly() {
|
||||||
|
let kms = LocalStackKMS::start().await;
|
||||||
|
let key_id = kms.create_key().await;
|
||||||
|
let store = CredentialStore::new(pg.pool(), kms.client(), key_id);
|
||||||
|
|
||||||
|
let original = "sk-live-abc123xyz";
|
||||||
|
store.save_credential("org-1", "openai", original).await.unwrap();
|
||||||
|
|
||||||
|
// Read raw from DB — must NOT be plaintext
|
||||||
|
let raw = pg.query_raw("SELECT encrypted_key FROM credentials LIMIT 1").await;
|
||||||
|
assert!(!String::from_utf8_lossy(&raw).contains(original));
|
||||||
|
|
||||||
|
// Decrypt via the store — must match original
|
||||||
|
let decrypted = store.get_credential("org-1", "openai").await.unwrap();
|
||||||
|
assert_eq!(decrypted, original);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn kms_key_rotation_old_deks_still_decrypt_old_credentials() {
|
||||||
|
let kms = LocalStackKMS::start().await;
|
||||||
|
let key_id = kms.create_key().await;
|
||||||
|
let store = CredentialStore::new(pg.pool(), kms.client(), key_id);
|
||||||
|
|
||||||
|
// Save with original key
|
||||||
|
store.save_credential("org-1", "openai", "sk-old").await.unwrap();
|
||||||
|
|
||||||
|
// Rotate KMS key
|
||||||
|
kms.rotate_key(key_id).await;
|
||||||
|
|
||||||
|
// Old credential must still decrypt
|
||||||
|
let decrypted = store.get_credential("org-1", "openai").await.unwrap();
|
||||||
|
assert_eq!(decrypted, "sk-old");
|
||||||
|
|
||||||
|
// New credential uses new DEK
|
||||||
|
store.save_credential("org-1", "anthropic", "sk-new").await.unwrap();
|
||||||
|
let decrypted_new = store.get_credential("org-1", "anthropic").await.unwrap();
|
||||||
|
assert_eq!(decrypted_new, "sk-new");
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.3 Slow Dependency Chaos Test
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[tokio::test]
|
||||||
|
async fn chaos_slow_db_does_not_block_proxy_hot_path() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
|
||||||
|
// Inject 5-second network delay on TimescaleDB port via tc netem
|
||||||
|
stack.inject_latency("timescaledb", Duration::from_secs(5)).await;
|
||||||
|
|
||||||
|
// Proxy must still route requests within SLA
|
||||||
|
let start = Instant::now();
|
||||||
|
let resp = stack.proxy()
|
||||||
|
.post("/v1/chat/completions")
|
||||||
|
.header("Authorization", "Bearer sk-valid")
|
||||||
|
.json(&chat_request())
|
||||||
|
.send().await;
|
||||||
|
let latency = start.elapsed();
|
||||||
|
|
||||||
|
assert_eq!(resp.status(), 200);
|
||||||
|
// Telemetry is dropped, but routing works
|
||||||
|
assert!(latency < Duration::from_millis(50),
|
||||||
|
"Proxy blocked by slow DB: {:?}", latency);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn chaos_slow_redis_falls_back_to_pg_for_auth() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
stack.inject_latency("redis", Duration::from_secs(3)).await;
|
||||||
|
|
||||||
|
let resp = stack.proxy()
|
||||||
|
.post("/v1/chat/completions")
|
||||||
|
.header("Authorization", "Bearer sk-valid")
|
||||||
|
.json(&chat_request())
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 200);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.4 IDOR / Cross-Tenant Test Suite
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/integration/idor_test.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn idor_org_a_cannot_read_org_b_routing_rules() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
let org_a_token = stack.create_org_and_token("org-a").await;
|
||||||
|
let org_b_token = stack.create_org_and_token("org-b").await;
|
||||||
|
|
||||||
|
// Org B creates a routing rule
|
||||||
|
let rule = stack.api()
|
||||||
|
.post("/v1/routing-rules")
|
||||||
|
.bearer_auth(&org_b_token)
|
||||||
|
.json(&json!({ "name": "secret-rule", "model": "gpt-4" }))
|
||||||
|
.send().await.json::<RoutingRule>().await;
|
||||||
|
|
||||||
|
// Org A tries to read it
|
||||||
|
let resp = stack.api()
|
||||||
|
.get(&format!("/v1/routing-rules/{}", rule.id))
|
||||||
|
.bearer_auth(&org_a_token)
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 404); // Not 403 — don't leak existence
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn idor_org_a_cannot_read_org_b_api_keys() {
|
||||||
|
// Same pattern — create key as org B, attempt read as org A
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn idor_org_a_cannot_read_org_b_telemetry() {}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn idor_org_a_cannot_mutate_org_b_routing_rules() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.5 SSE Connection Drop / Billing Leak Test
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[tokio::test]
|
||||||
|
async fn sse_client_disconnect_aborts_upstream_provider_request() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
let mock_provider = stack.mock_provider();
|
||||||
|
|
||||||
|
// Configure provider to stream slowly (1 token/sec for 60 tokens)
|
||||||
|
mock_provider.configure_slow_stream(60, Duration::from_secs(1));
|
||||||
|
|
||||||
|
// Start streaming request
|
||||||
|
let mut stream = stack.proxy()
|
||||||
|
.post("/v1/chat/completions")
|
||||||
|
.json(&json!({ "stream": true, "model": "gpt-4" }))
|
||||||
|
.send().await
|
||||||
|
.bytes_stream();
|
||||||
|
|
||||||
|
// Read 5 tokens then drop the connection
|
||||||
|
for _ in 0..5 {
|
||||||
|
stream.next().await;
|
||||||
|
}
|
||||||
|
drop(stream);
|
||||||
|
|
||||||
|
// Wait briefly for cleanup
|
||||||
|
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||||
|
|
||||||
|
// Provider connection must be aborted — not still streaming
|
||||||
|
assert_eq!(mock_provider.active_connections(), 0);
|
||||||
|
|
||||||
|
// Billing: customer should only be charged for 5 tokens, not 60
|
||||||
|
let usage = stack.get_last_usage_record().await;
|
||||||
|
assert!(usage.completion_tokens <= 10); // Some buffer for in-flight
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.6 Concurrent Circuit Breaker Race Condition
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[tokio::test]
|
||||||
|
async fn circuit_breaker_handles_50_concurrent_failures_cleanly() {
|
||||||
|
let redis = TestcontainersRedis::start().await;
|
||||||
|
let breaker = RedisCircuitBreaker::new(redis.connection_string(), "openai", 10);
|
||||||
|
|
||||||
|
let mut handles = vec![];
|
||||||
|
for _ in 0..50 {
|
||||||
|
let b = breaker.clone();
|
||||||
|
handles.push(tokio::spawn(async move {
|
||||||
|
b.record_failure().await;
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
futures::future::join_all(handles).await;
|
||||||
|
|
||||||
|
// Breaker must be open — no race condition leaving it closed
|
||||||
|
assert_eq!(breaker.state().await, CircuitState::Open);
|
||||||
|
// Failure count must be exactly 50 (atomic increments)
|
||||||
|
assert_eq!(breaker.failure_count().await, 50);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.7 Trace Context Propagation
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[tokio::test]
|
||||||
|
async fn otel_trace_propagates_from_client_through_proxy_to_provider() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
let tracer = stack.in_memory_tracer();
|
||||||
|
|
||||||
|
let resp = stack.proxy()
|
||||||
|
.post("/v1/chat/completions")
|
||||||
|
.header("traceparent", "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01")
|
||||||
|
.json(&chat_request())
|
||||||
|
.send().await;
|
||||||
|
|
||||||
|
let spans = tracer.finished_spans();
|
||||||
|
let proxy_span = spans.iter().find(|s| s.name == "proxy.route").unwrap();
|
||||||
|
|
||||||
|
// Proxy span must be child of the incoming trace
|
||||||
|
assert_eq!(proxy_span.trace_id, "4bf92f3577b34da6a3ce929d0e0e4736");
|
||||||
|
|
||||||
|
// Provider request must carry the same trace_id
|
||||||
|
let provider_req = stack.mock_provider().last_request();
|
||||||
|
assert!(provider_req.headers["traceparent"].contains("4bf92f3577b34da6a3ce929d0e0e4736"));
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.8 Flag Provider Fallback Test
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[test]
|
||||||
|
fn flag_provider_unreachable_falls_back_to_safe_default() {
|
||||||
|
// Simulate missing/corrupt flag config file
|
||||||
|
let provider = JsonFileProvider::new("/nonexistent/flags.json");
|
||||||
|
let result = provider.evaluate("enable_new_router", false);
|
||||||
|
// Must return the safe default (false), not panic or error
|
||||||
|
assert_eq!(result, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn flag_provider_malformed_json_falls_back_to_safe_default() {
|
||||||
|
let provider = JsonFileProvider::from_string("{ invalid json }}}");
|
||||||
|
let result = provider.evaluate("enable_new_router", false);
|
||||||
|
assert_eq!(result, false);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.9 24-Hour Soak Test Spec
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/soak/long_running_latency.rs
|
||||||
|
// Run manually: cargo test --test soak -- --ignored
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore] // Only run in nightly CI
|
||||||
|
async fn soak_24h_proxy_latency_stays_under_5ms_p99() {
|
||||||
|
// k6 config: 10 RPS sustained for 24 hours
|
||||||
|
// Assert: p99 < 5ms, no memory growth > 50MB, no connection leaks
|
||||||
|
// This catches memory fragmentation and connection pool exhaustion
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.10 Panic Mode Authorization
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[tokio::test]
|
||||||
|
async fn panic_mode_requires_owner_role() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
let viewer_token = stack.create_token_with_role("org-1", Role::Viewer).await;
|
||||||
|
|
||||||
|
let resp = stack.api()
|
||||||
|
.post("/admin/panic")
|
||||||
|
.bearer_auth(&viewer_token)
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 403);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn panic_mode_allowed_for_owner_role() {
|
||||||
|
let owner_token = stack.create_token_with_role("org-1", Role::Owner).await;
|
||||||
|
let resp = stack.api()
|
||||||
|
.post("/admin/panic")
|
||||||
|
.bearer_auth(&owner_token)
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 200);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
*End of P1 Review Remediation Addendum*
|
||||||
|
|||||||
204
products/01-llm-cost-router/tests/analytics/analytics.spec.ts
Normal file
204
products/01-llm-cost-router/tests/analytics/analytics.spec.ts
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||||
|
import { Analytics, EventSchema, EventName } from '../../src/analytics';
|
||||||
|
import { PostHog } from 'posthog-node';
|
||||||
|
|
||||||
|
vi.mock('posthog-node');
|
||||||
|
|
||||||
|
describe('Analytics SDK (PostHog Cloud — v2 Post-Review)', () => {
|
||||||
|
let analytics: Analytics;
|
||||||
|
let mockPostHog: vi.Mocked<PostHog>;
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
vi.clearAllMocks();
|
||||||
|
mockPostHog = new PostHog('phc_test_key', { host: 'https://us.i.posthog.com' }) as any;
|
||||||
|
analytics = new Analytics(mockPostHog);
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Schema Validation (Zod) ──────────────────────────────
|
||||||
|
|
||||||
|
describe('Event Taxonomy Validation', () => {
|
||||||
|
it('accepts valid account.signup.completed event', () => {
|
||||||
|
const event = {
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route' as const,
|
||||||
|
properties: { method: 'github_sso' },
|
||||||
|
};
|
||||||
|
expect(() => EventSchema.parse(event)).not.toThrow();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('rejects events missing tenant_id', () => {
|
||||||
|
const event = {
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'email' },
|
||||||
|
};
|
||||||
|
expect(() => EventSchema.parse(event as any)).toThrow(/tenant_id/);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('accepts valid activation event', () => {
|
||||||
|
const event = {
|
||||||
|
name: EventName.FirstDollarSaved,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route' as const,
|
||||||
|
properties: { savings_amount: 1.50 },
|
||||||
|
};
|
||||||
|
expect(() => EventSchema.parse(event)).not.toThrow();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('accepts valid upgrade event', () => {
|
||||||
|
const event = {
|
||||||
|
name: EventName.UpgradeCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route' as const,
|
||||||
|
properties: { plan: 'pro', mrr_increase: 49 },
|
||||||
|
};
|
||||||
|
expect(() => EventSchema.parse(event)).not.toThrow();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── track() Behavior ─────────────────────────────────────
|
||||||
|
|
||||||
|
describe('track()', () => {
|
||||||
|
it('captures valid events via PostHog client', () => {
|
||||||
|
const result = analytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'email' },
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result).toBe(true);
|
||||||
|
expect(mockPostHog.capture).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
distinctId: 'tenant-123',
|
||||||
|
event: 'account.signup.completed',
|
||||||
|
properties: expect.objectContaining({
|
||||||
|
product: 'route',
|
||||||
|
method: 'email',
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does NOT include $set in track calls (use identify instead)', () => {
|
||||||
|
analytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'github_sso' },
|
||||||
|
});
|
||||||
|
|
||||||
|
const captureCall = mockPostHog.capture.mock.calls[0][0];
|
||||||
|
expect(captureCall.properties).not.toHaveProperty('$set');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does NOT pass timestamp (let PostHog handle it to avoid clock skew)', () => {
|
||||||
|
analytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'email' },
|
||||||
|
});
|
||||||
|
|
||||||
|
const captureCall = mockPostHog.capture.mock.calls[0][0];
|
||||||
|
expect(captureCall).not.toHaveProperty('timestamp');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns false and does NOT call PostHog if base validation fails', () => {
|
||||||
|
const result = analytics.track({
|
||||||
|
name: 'invalid.event' as any,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result).toBe(false);
|
||||||
|
expect(mockPostHog.capture).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns false if per-event property validation fails (strict schema)', () => {
|
||||||
|
const result = analytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'invalid_method' }, // Not in enum
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result).toBe(false);
|
||||||
|
expect(mockPostHog.capture).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('rejects unknown properties (strict mode — no PII loophole)', () => {
|
||||||
|
const result = analytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'email', email: 'user@example.com' }, // PII leak attempt
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result).toBe(false);
|
||||||
|
expect(mockPostHog.capture).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does NOT flush after each track call (Lambda batching)', () => {
|
||||||
|
analytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'email' },
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(mockPostHog.flushAsync).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── identify() ───────────────────────────────────────────
|
||||||
|
|
||||||
|
describe('identify()', () => {
|
||||||
|
it('calls PostHog identify with tenant_id as distinctId', () => {
|
||||||
|
analytics.identify('tenant-123', { company: 'Acme' });
|
||||||
|
|
||||||
|
expect(mockPostHog.identify).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
distinctId: 'tenant-123',
|
||||||
|
properties: expect.objectContaining({
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
company: 'Acme',
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── flush() ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
describe('flush()', () => {
|
||||||
|
it('calls flushAsync on the PostHog client', async () => {
|
||||||
|
await analytics.flush();
|
||||||
|
expect(mockPostHog.flushAsync).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── NoOp Client ──────────────────────────────────────────
|
||||||
|
|
||||||
|
describe('NoOp Client (missing API key)', () => {
|
||||||
|
it('does not throw when tracking without API key', () => {
|
||||||
|
const noopAnalytics = new Analytics(); // No client, no env var
|
||||||
|
const result = noopAnalytics.track({
|
||||||
|
name: EventName.SignupCompleted,
|
||||||
|
tenant_id: 'tenant-123',
|
||||||
|
product: 'route',
|
||||||
|
properties: { method: 'email' },
|
||||||
|
});
|
||||||
|
expect(result).toBe(true); // NoOp accepts everything silently
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Session Replay ───────────────────────────────────────
|
||||||
|
|
||||||
|
describe('Security', () => {
|
||||||
|
it('session replay is disabled', () => {
|
||||||
|
expect(analytics.isSessionReplayEnabled).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -1727,3 +1727,370 @@ Before any code ships to production, these tests must be green:
|
|||||||
---
|
---
|
||||||
|
|
||||||
*Document complete. Total estimated test count at V1 launch: ~500 tests. Target by month 3: ~1,000 tests.*
|
*Document complete. Total estimated test count at V1 launch: ~500 tests. Target by month 3: ~1,000 tests.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Review Remediation Addendum (Post-Gemini Review)
|
||||||
|
|
||||||
|
### 11.1 Missing Epic Coverage
|
||||||
|
|
||||||
|
#### Epic 6: Dashboard UI (React Testing Library + Playwright)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// tests/ui/components/DiffViewer.test.tsx
|
||||||
|
describe('DiffViewer Component', () => {
|
||||||
|
it('renders added lines in green', () => {});
|
||||||
|
it('renders removed lines in red', () => {});
|
||||||
|
it('renders unchanged lines in default color', () => {});
|
||||||
|
it('collapses large diffs with "Show more" toggle', () => {});
|
||||||
|
it('highlights HCL syntax in diff blocks', () => {});
|
||||||
|
it('shows resource type icon next to each drift item', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('StackOverview Component', () => {
|
||||||
|
it('renders drift count badge per stack', () => {});
|
||||||
|
it('sorts stacks by drift severity (critical first)', () => {});
|
||||||
|
it('shows last scan timestamp', () => {});
|
||||||
|
it('shows agent health indicator (green/yellow/red)', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
// tests/e2e/ui/dashboard.spec.ts (Playwright)
|
||||||
|
test('OAuth login redirects to Cognito and back', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard');
|
||||||
|
await expect(page).toHaveURL(/cognito/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('stack list renders with drift counts', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/stacks');
|
||||||
|
await expect(page.locator('[data-testid="stack-card"]')).not.toHaveCount(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('diff viewer renders inline diff for Terraform resource', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/stacks/stack-1/drifts/drift-1');
|
||||||
|
await expect(page.locator('[data-testid="diff-viewer"]')).toBeVisible();
|
||||||
|
await expect(page.locator('.diff-added')).not.toHaveCount(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('revert button triggers confirmation modal', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/stacks/stack-1/drifts/drift-1');
|
||||||
|
await page.click('[data-testid="revert-btn"]');
|
||||||
|
await expect(page.locator('[data-testid="confirm-modal"]')).toBeVisible();
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 9: Onboarding & PLG (Stripe + drift init)
|
||||||
|
|
||||||
|
```go
|
||||||
|
// pkg/onboarding/stripe_test.go
|
||||||
|
|
||||||
|
func TestStripeWebhookCheckoutCompleted_UpgradesTenant(t *testing.T) {}
|
||||||
|
func TestStripeWebhookSubscriptionDeleted_DowngradesTenant(t *testing.T) {}
|
||||||
|
func TestStripeWebhookInvalidSignature_Returns401(t *testing.T) {}
|
||||||
|
func TestStripeWebhookReplayedEvent_IsIdempotent(t *testing.T) {}
|
||||||
|
|
||||||
|
// pkg/agent/init_test.go
|
||||||
|
|
||||||
|
func TestDriftInit_DetectsTerraformInCurrentDir(t *testing.T) {}
|
||||||
|
func TestDriftInit_DetectsCloudFormationInCurrentDir(t *testing.T) {}
|
||||||
|
func TestDriftInit_DetectsPulumiInCurrentDir(t *testing.T) {}
|
||||||
|
func TestDriftInit_GeneratesValidYAMLConfig(t *testing.T) {}
|
||||||
|
func TestDriftInit_HandlesWindowsPaths(t *testing.T) {}
|
||||||
|
func TestDriftInit_HandlesMacPaths(t *testing.T) {}
|
||||||
|
func TestDriftInit_HandlesLinuxPaths(t *testing.T) {}
|
||||||
|
func TestDriftInit_FailsGracefullyOnEmptyDir(t *testing.T) {}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 8: Infrastructure (Terratest)
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/infra/terraform_test.go
|
||||||
|
|
||||||
|
func TestTerraformPlan_CreatesExpectedResources(t *testing.T) {
|
||||||
|
terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
|
||||||
|
TerraformDir: "../../infra/terraform",
|
||||||
|
})
|
||||||
|
defer terraform.Destroy(t, terraformOptions)
|
||||||
|
terraform.InitAndPlan(t, terraformOptions)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTerraformApply_SQSFIFOQueueCreated(t *testing.T) {}
|
||||||
|
func TestTerraformApply_RDSInstanceCreated(t *testing.T) {}
|
||||||
|
func TestTerraformApply_IAMRolesHaveLeastPrivilege(t *testing.T) {
|
||||||
|
// Verify no IAM policy has Action: "*"
|
||||||
|
}
|
||||||
|
func TestTerraformApply_VPCSecurityGroupsRestrictIngress(t *testing.T) {}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 2: mTLS Certificate Lifecycle
|
||||||
|
|
||||||
|
```go
|
||||||
|
// pkg/agent/mtls_test.go
|
||||||
|
|
||||||
|
func TestMTLS_CertificateGeneration_ValidX509(t *testing.T) {}
|
||||||
|
func TestMTLS_CertificateExpiration_AgentRejectsExpiredCert(t *testing.T) {}
|
||||||
|
func TestMTLS_CertificateRotation_NewCertAcceptedMidConnection(t *testing.T) {}
|
||||||
|
func TestMTLS_CertificateRevocation_RevokedCertRejected(t *testing.T) {}
|
||||||
|
func TestMTLS_SelfSignedCert_RejectedBySaaS(t *testing.T) {}
|
||||||
|
func TestMTLS_CertificateChain_IntermediateCAValidated(t *testing.T) {}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.2 Add t.Parallel() to Table-Driven Tests
|
||||||
|
|
||||||
|
```go
|
||||||
|
// BEFORE (sequential — wastes CI time):
|
||||||
|
func TestSecretScrubber(t *testing.T) {
|
||||||
|
tests := []struct{ name, input, expected string }{...}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
// runs sequentially
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// AFTER (parallel):
|
||||||
|
func TestSecretScrubber(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
tests := []struct{ name, input, expected string }{...}
|
||||||
|
for _, tt := range tests {
|
||||||
|
tt := tt // capture range variable
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
// runs in parallel
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.3 Dynamic Resource Naming for LocalStack
|
||||||
|
|
||||||
|
```go
|
||||||
|
// BEFORE (shared state — flaky):
|
||||||
|
// bucket := "drift-reports"
|
||||||
|
|
||||||
|
// AFTER (per-test isolation):
|
||||||
|
// uniqueBucket returns a per-test, collision-free S3 bucket name.
// S3 bucket names must be lowercase and may not contain "/" — t.Name()
// includes the subtest path (e.g. "TestFoo/case_1") and mixed case, so it
// is sanitized before use. The nanosecond suffix isolates parallel runs.
func uniqueBucket(t *testing.T) string {
	name := strings.ToLower(strings.ReplaceAll(t.Name(), "/", "-"))
	return fmt.Sprintf("drift-reports-%s-%d", name, time.Now().UnixNano())
}
|
||||||
|
|
||||||
|
func TestDriftReportUpload(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
bucket := uniqueBucket(t)
|
||||||
|
s3Client.CreateBucket(ctx, &s3.CreateBucketInput{Bucket: &bucket})
|
||||||
|
// Test uses isolated bucket — no cross-test contamination
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.4 Distributed Tracing Cross-Boundary Tests
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/integration/trace_propagation_test.go
|
||||||
|
|
||||||
|
func TestTraceContext_AgentToSaaS_SpanParentChain(t *testing.T) {
|
||||||
|
// Agent generates drift_scan span with trace_id
|
||||||
|
// POST /v1/drift-reports carries traceparent header
|
||||||
|
// SaaS Event Processor creates child span
|
||||||
|
// Verify parent-child relationship across HTTP boundary
|
||||||
|
|
||||||
|
exporter := tracetest.NewInMemoryExporter()
|
||||||
|
|
||||||
|
// Fire drift report with traceparent
|
||||||
|
traceID := "4bf92f3577b34da6a3ce929d0e0e4736"
|
||||||
|
resp := postDriftReport(t, stack, traceID)
|
||||||
|
assert.Equal(t, 200, resp.StatusCode)
|
||||||
|
|
||||||
|
spans := exporter.GetSpans()
|
||||||
|
eventProcessorSpan := findSpan(spans, "drift_report.process")
|
||||||
|
assert.Equal(t, traceID, eventProcessorSpan.SpanContext().TraceID().String())
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTraceContext_SQSBoundary_PreservesTraceID(t *testing.T) {
|
||||||
|
// Verify SQS message attributes contain traceparent
|
||||||
|
// Verify consumer extracts and continues the trace
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTraceContext_AgentScan_CreatesParentSpan(t *testing.T) {
|
||||||
|
// Verify agent drift_scan span has correct attributes:
|
||||||
|
// drift.stack_id, drift.resource_count, drift.duration_ms
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.5 Backward Compatibility Serialization (Elastic Schema)
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/schema/backward_compat_test.go
|
||||||
|
|
||||||
|
func TestOldAgent_ParsesNewDynamoDBItem_WithV2Attributes(t *testing.T) {
|
||||||
|
// Simulate V2 DynamoDB item with new _v2 fields
|
||||||
|
item := map[string]types.AttributeValue{
|
||||||
|
"PK": &types.AttributeValueMemberS{Value: "STACK#123"},
|
||||||
|
"drift_score": &types.AttributeValueMemberN{Value: "85"},
|
||||||
|
"drift_score_v2": &types.AttributeValueMemberN{Value: "92"}, // New field
|
||||||
|
"remediation_v2": &types.AttributeValueMemberS{Value: "auto"}, // New field
|
||||||
|
}
|
||||||
|
|
||||||
|
// V1 parser must ignore unknown fields
|
||||||
|
result, err := ParseDriftItem(item)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, 85, result.DriftScore) // Uses V1 field
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestV1Code_ReadsV2Writes_DuringMigrationWindow(t *testing.T) {
|
||||||
|
// V2 writes both drift_score and drift_score_v2
|
||||||
|
// V1 reads drift_score (ignores _v2)
|
||||||
|
// Verify no data loss
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.6 Security: RBAC Forgery & Replay Attacks
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/integration/security_test.go
|
||||||
|
|
||||||
|
func TestAgentCannotForgeStackID(t *testing.T) {
|
||||||
|
// Agent with API key for org-A sends drift report claiming stack belongs to org-B
|
||||||
|
orgAKey := createAPIKey(t, "org-a")
|
||||||
|
report := makeDriftReport("org-b-stack-id") // Wrong org
|
||||||
|
|
||||||
|
resp := postDriftReportWithKey(t, report, orgAKey)
|
||||||
|
assert.Equal(t, 403, resp.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReplayAttack_DuplicateReportID_Rejected(t *testing.T) {
|
||||||
|
report := makeDriftReport("stack-1")
|
||||||
|
resp1 := postDriftReport(t, report)
|
||||||
|
assert.Equal(t, 200, resp1.StatusCode)
|
||||||
|
|
||||||
|
// Replay exact same report
|
||||||
|
resp2 := postDriftReport(t, report)
|
||||||
|
assert.Equal(t, 409, resp2.StatusCode) // Conflict — already processed
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReplayAttack_OldTimestamp_Rejected(t *testing.T) {
|
||||||
|
report := makeDriftReport("stack-1")
|
||||||
|
report.Timestamp = time.Now().Add(-10 * time.Minute) // 10 min old
|
||||||
|
|
||||||
|
resp := postDriftReport(t, report)
|
||||||
|
assert.Equal(t, 400, resp.StatusCode) // Stale report
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.7 Noisy Neighbor & Fair-Share Processing
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/integration/fair_share_test.go
|
||||||
|
|
||||||
|
func TestNoisyNeighbor_LargeOrgDoesNotStarveSmallOrg(t *testing.T) {
|
||||||
|
// Org A: 10,000 drifted resources
|
||||||
|
// Org B: 10 drifted resources
|
||||||
|
// Both submit reports simultaneously
|
||||||
|
|
||||||
|
seedDriftReports(t, "org-a", 10000)
|
||||||
|
seedDriftReports(t, "org-b", 10)
|
||||||
|
|
||||||
|
// Org B's reports must be processed within 30 seconds
|
||||||
|
// (not queued behind all 10K of Org A's)
|
||||||
|
start := time.Now()
|
||||||
|
waitForProcessed(t, "org-b", 10, 30*time.Second)
|
||||||
|
assert.Less(t, time.Since(start), 30*time.Second)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.8 Panic Mode Mid-Remediation Race Condition
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/integration/panic_remediation_test.go
|
||||||
|
|
||||||
|
func TestPanicMode_AbortsInFlightRemediation(t *testing.T) {
|
||||||
|
// Start a remediation (terraform apply)
|
||||||
|
execID := startRemediation(t, "stack-1", "drift-1")
|
||||||
|
waitForState(t, execID, "applying")
|
||||||
|
|
||||||
|
// Trigger panic mode
|
||||||
|
triggerPanicMode(t)
|
||||||
|
|
||||||
|
// Remediation must be aborted, not completed
|
||||||
|
state := waitForState(t, execID, "aborted")
|
||||||
|
assert.Equal(t, "aborted", state)
|
||||||
|
|
||||||
|
// Verify terraform state is not corrupted
|
||||||
|
// (agent should have run terraform state pull to verify)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPanicMode_DoesNotAbortReadOnlyScans(t *testing.T) {
|
||||||
|
// Drift scans (read-only) should continue during panic
|
||||||
|
// Only write operations (remediation) are halted
|
||||||
|
scanID := startDriftScan(t, "stack-1")
|
||||||
|
triggerPanicMode(t)
|
||||||
|
|
||||||
|
state := waitForState(t, scanID, "completed")
|
||||||
|
assert.Equal(t, "completed", state) // Scan finishes normally
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.9 Remediation vs. Concurrent Scan Race Condition
|
||||||
|
|
||||||
|
```go
|
||||||
|
func TestConcurrentScanDuringRemediation_DoesNotReportHalfAppliedState(t *testing.T) {
|
||||||
|
// Start remediation (terraform apply — takes ~30s)
|
||||||
|
execID := startRemediation(t, "stack-1", "drift-1")
|
||||||
|
waitForState(t, execID, "applying")
|
||||||
|
|
||||||
|
// Trigger a drift scan while remediation is in progress
|
||||||
|
scanID := startDriftScan(t, "stack-1")
|
||||||
|
|
||||||
|
// Scan must either:
|
||||||
|
// a) Wait for remediation to complete, OR
|
||||||
|
// b) Skip the stack with "remediation in progress" status
|
||||||
|
scanResult := waitForScanComplete(t, scanID)
|
||||||
|
assert.NotEqual(t, "half-applied", scanResult.Status)
|
||||||
|
// Must be either "skipped_remediation_in_progress" or show post-remediation state
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.10 SaaS API Memory Profiling
|
||||||
|
|
||||||
|
```go
|
||||||
|
// tests/load/memory_profile_test.go
|
||||||
|
|
||||||
|
func TestEventProcessor_DoesNotOOM_On1MB_DriftReport(t *testing.T) {
|
||||||
|
// Generate a 1MB drift report (1000 resources with large diffs)
|
||||||
|
report := makeLargeDriftReport(1000)
|
||||||
|
assert.Greater(t, len(report), 1024*1024)
|
||||||
|
|
||||||
|
var memBefore, memAfter runtime.MemStats
|
||||||
|
runtime.ReadMemStats(&memBefore)
|
||||||
|
|
||||||
|
processReport(t, report)
|
||||||
|
|
||||||
|
runtime.ReadMemStats(&memAfter)
|
||||||
|
growth := memAfter.Alloc - memBefore.Alloc
|
||||||
|
assert.Less(t, growth, uint64(50*1024*1024)) // <50MB growth
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.11 Trim E2E to Smoke Tier
|
||||||
|
|
||||||
|
Per review recommendation, cap E2E at 10 critical paths. Remaining 40 tests pushed to integration:
|
||||||
|
|
||||||
|
| E2E (Keep — 10 max) | Demoted to Integration |
|
||||||
|
|---------------------|----------------------|
|
||||||
|
| Onboarding: init → connect → first scan | Agent heartbeat variations |
|
||||||
|
| First drift detected → Slack alert | Individual parser format tests |
|
||||||
|
| Revert flow: Slack → agent apply → verify | Secret scrubber edge cases |
|
||||||
|
| Panic mode halts remediation | DynamoDB access pattern tests |
|
||||||
|
| Cross-tenant isolation | Individual webhook format tests |
|
||||||
|
| OAuth login → dashboard → view diff | Notification batching |
|
||||||
|
| Free tier limit enforcement | Agent config reload |
|
||||||
|
| Agent disconnect → reconnect → resume | Baseline score calculations |
|
||||||
|
| mTLS cert rotation mid-scan | Individual API endpoint tests |
|
||||||
|
| Stripe upgrade → unlock features | Cache invalidation patterns |
|
||||||
|
|
||||||
|
### 11.12 Updated Test Pyramid (Post-Review)
|
||||||
|
|
||||||
|
| Level | Original | Revised | Rationale |
|
||||||
|
|-------|----------|---------|-----------|
|
||||||
|
| Unit | 70% (~350) | 65% (~350) | Add t.Parallel(), keep count but add UI component tests |
|
||||||
|
| Integration | 20% (~100) | 28% (~150) | Terratest, mTLS, trace propagation, fair-share, security |
|
||||||
|
| E2E/Smoke | 10% (~50) | 7% (~35) | Capped at 10 true E2E + 25 Playwright UI tests |
|
||||||
|
|
||||||
|
*End of P2 Review Remediation Addendum*
|
||||||
|
|||||||
@@ -1409,3 +1409,459 @@ Before any release, these tests must pass:
|
|||||||
---
|
---
|
||||||
|
|
||||||
*End of dd0c/alert Test Architecture*
|
*End of dd0c/alert Test Architecture*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Review Remediation Addendum (Post-Gemini Review)
|
||||||
|
|
||||||
|
### 11.1 Missing Epic Coverage
|
||||||
|
|
||||||
|
#### Epic 6: Dashboard API
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Dashboard API', () => {
|
||||||
|
describe('Authentication', () => {
|
||||||
|
it('returns 401 for missing Cognito JWT', async () => {});
|
||||||
|
it('returns 401 for expired JWT', async () => {});
|
||||||
|
it('returns 401 for JWT signed by wrong issuer', async () => {});
|
||||||
|
it('extracts tenantId from JWT claims', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Incident Listing (GET /v1/incidents)', () => {
|
||||||
|
it('returns paginated incidents for authenticated tenant', async () => {});
|
||||||
|
it('supports cursor-based pagination', async () => {});
|
||||||
|
it('filters by status (open, acknowledged, resolved)', async () => {});
|
||||||
|
it('filters by severity (critical, warning, info)', async () => {});
|
||||||
|
it('filters by time range (since, until)', async () => {});
|
||||||
|
it('returns empty array for tenant with no incidents', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Incident Detail (GET /v1/incidents/:id)', () => {
|
||||||
|
it('returns full incident with correlated alerts', async () => {});
|
||||||
|
it('returns 404 for incident belonging to different tenant', async () => {});
|
||||||
|
it('includes timeline of state transitions', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Analytics (GET /v1/analytics)', () => {
|
||||||
|
it('returns MTTR for last 7/30/90 days', async () => {});
|
||||||
|
it('returns alert volume by source', async () => {});
|
||||||
|
it('returns noise reduction percentage', async () => {});
|
||||||
|
it('scopes all analytics to authenticated tenant', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Tenant Isolation', () => {
|
||||||
|
it('tenant A cannot read tenant B incidents via API', async () => {});
|
||||||
|
it('tenant A cannot read tenant B analytics', async () => {});
|
||||||
|
it('all DynamoDB queries include tenantId partition key', async () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 7: Dashboard UI (Playwright)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// tests/e2e/ui/dashboard.spec.ts
|
||||||
|
|
||||||
|
test('login redirects to Cognito hosted UI', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard');
|
||||||
|
await expect(page).toHaveURL(/cognito/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('incident list renders with correct severity badges', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/incidents');
|
||||||
|
await expect(page.locator('[data-testid="incident-card"]')).toHaveCount(5);
|
||||||
|
await expect(page.locator('.severity-critical')).toBeVisible();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('incident detail shows correlated alert timeline', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/incidents/inc-123');
|
||||||
|
await expect(page.locator('[data-testid="alert-timeline"]')).toBeVisible();
|
||||||
|
  expect(await page.locator('.timeline-event').count()).toBeGreaterThan(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('MTTR chart renders with real data', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/analytics');
|
||||||
|
await expect(page.locator('[data-testid="mttr-chart"]')).toBeVisible();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('noise reduction percentage displays correctly', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/analytics');
|
||||||
|
const noise = page.locator('[data-testid="noise-reduction"]');
|
||||||
|
await expect(noise).toContainText('%');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('webhook setup wizard generates correct URL', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/settings/integrations');
|
||||||
|
await page.click('[data-testid="add-datadog"]');
|
||||||
|
const url = await page.locator('[data-testid="webhook-url"]').textContent();
|
||||||
|
expect(url).toMatch(/\/v1\/webhooks\/ingest\/.+/);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 9: Onboarding & PLG
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Free Tier Enforcement', () => {
|
||||||
|
it('allows up to 10,000 alerts/month on free tier', async () => {});
|
||||||
|
it('returns 429 with upgrade prompt at 10,001st alert', async () => {});
|
||||||
|
it('resets counter on first of each month', async () => {});
|
||||||
|
it('purges alert data older than 7 days on free tier', async () => {});
|
||||||
|
it('retains alert data for 90 days on pro tier', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('OAuth Signup', () => {
|
||||||
|
it('creates tenant record on first Cognito login', async () => {});
|
||||||
|
it('assigns free tier by default', async () => {});
|
||||||
|
it('generates unique webhook URL per tenant', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Stripe Integration', () => {
|
||||||
|
it('creates checkout session with correct pricing', async () => {});
|
||||||
|
it('upgrades tenant on checkout.session.completed webhook', async () => {});
|
||||||
|
it('downgrades tenant on subscription.deleted webhook', async () => {});
|
||||||
|
it('validates Stripe webhook signature', async () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 5.3: Slack Feedback Endpoint
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Slack Interactive Actions Endpoint', () => {
|
||||||
|
it('validates Slack request signature (HMAC-SHA256)', async () => {});
|
||||||
|
it('rejects request with invalid signature', async () => {});
|
||||||
|
it('handles "helpful" feedback — updates incident quality score', async () => {});
|
||||||
|
it('handles "noise" feedback — adds to suppression training data', async () => {});
|
||||||
|
it('handles "escalate" action — triggers PagerDuty/OpsGenie', async () => {});
|
||||||
|
it('updates original Slack message after action', async () => {});
|
||||||
|
it('scopes action to correct tenant', async () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 1.4: S3 Raw Payload Archival
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Raw Payload Archival', () => {
|
||||||
|
it('saves raw webhook payload to S3 asynchronously', async () => {});
|
||||||
|
it('S3 key includes tenantId, source, and timestamp', async () => {});
|
||||||
|
it('archival failure does not block alert processing', async () => {});
|
||||||
|
it('archived payload is retrievable for replay', async () => {});
|
||||||
|
it('S3 lifecycle policy deletes after retention period', async () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.2 Anti-Pattern Fixes
|
||||||
|
|
||||||
|
#### Replace ioredis-mock with WindowStore Interface
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// BEFORE (anti-pattern):
|
||||||
|
// import RedisMock from 'ioredis-mock';
|
||||||
|
// const engine = new CorrelationEngine(new RedisMock());
|
||||||
|
|
||||||
|
// AFTER (correct):
|
||||||
|
interface WindowStore {
|
||||||
|
addEvent(tenantId: string, key: string, event: Alert, ttlMs: number): Promise<void>;
|
||||||
|
getWindow(tenantId: string, key: string): Promise<Alert[]>;
|
||||||
|
clearWindow(tenantId: string, key: string): Promise<void>;
|
||||||
|
}
|
||||||
|
|
||||||
|
class InMemoryWindowStore implements WindowStore {
|
||||||
|
private store = new Map<string, { events: Alert[]; expiresAt: number }>();
|
||||||
|
|
||||||
|
async addEvent(tenantId: string, key: string, event: Alert, ttlMs: number) {
|
||||||
|
const fullKey = `${tenantId}:${key}`;
|
||||||
|
const existing = this.store.get(fullKey) || { events: [], expiresAt: Date.now() + ttlMs };
|
||||||
|
existing.events.push(event);
|
||||||
|
this.store.set(fullKey, existing);
|
||||||
|
}
|
||||||
|
|
||||||
|
async getWindow(tenantId: string, key: string): Promise<Alert[]> {
|
||||||
|
const fullKey = `${tenantId}:${key}`;
|
||||||
|
const entry = this.store.get(fullKey);
|
||||||
|
if (!entry || entry.expiresAt < Date.now()) return [];
|
||||||
|
return entry.events;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unit tests use InMemoryWindowStore — no Redis dependency
|
||||||
|
// Integration tests use RedisWindowStore with Testcontainers
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Replace sinon.useFakeTimers with Clock Interface
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// BEFORE (anti-pattern):
|
||||||
|
// sinon.useFakeTimers(new Date('2026-03-01T00:00:00Z'));
|
||||||
|
|
||||||
|
// AFTER (correct):
|
||||||
|
interface Clock {
|
||||||
|
now(): number;
|
||||||
|
advanceBy(ms: number): void;
|
||||||
|
}
|
||||||
|
|
||||||
|
class FakeClock implements Clock {
|
||||||
|
private current: number;
|
||||||
|
constructor(start: Date = new Date()) { this.current = start.getTime(); }
|
||||||
|
now() { return this.current; }
|
||||||
|
advanceBy(ms: number) { this.current += ms; }
|
||||||
|
}
|
||||||
|
|
||||||
|
class SystemClock implements Clock {
|
||||||
|
now() { return Date.now(); }
|
||||||
|
advanceBy() { throw new Error('Cannot advance system clock'); }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Inject into CorrelationEngine:
|
||||||
|
const engine = new CorrelationEngine(new InMemoryWindowStore(), new FakeClock());
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.3 Trace Context Propagation Tests
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Trace Context Propagation', () => {
|
||||||
|
it('API Gateway passes trace_id to Lambda via X-Amzn-Trace-Id', async () => {});
|
||||||
|
|
||||||
|
it('Lambda propagates trace_id into SQS message attributes', async () => {
|
||||||
|
// Verify SQS message has MessageAttribute 'traceparent' with W3C format
|
||||||
|
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||||
|
expect(msg.MessageAttributes.traceparent).toBeDefined();
|
||||||
|
expect(msg.MessageAttributes.traceparent.StringValue).toMatch(
|
||||||
|
/^00-[0-9a-f]{32}-[0-9a-f]{16}-0[01]$/
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('ECS Correlation Engine extracts trace_id from SQS message', async () => {
|
||||||
|
// Verify the correlation span has the correct parent from SQS
|
||||||
|
const spans = inMemoryExporter.getFinishedSpans();
|
||||||
|
const correlationSpan = spans.find(s => s.name === 'alert.correlation');
|
||||||
|
const ingestSpan = spans.find(s => s.name === 'webhook.ingest');
|
||||||
|
expect(correlationSpan.parentSpanId).toBeDefined();
|
||||||
|
// Parent chain must trace back to the original ingest span
|
||||||
|
});
|
||||||
|
|
||||||
|
it('end-to-end trace spans webhook → SQS → correlation → notification', async () => {
|
||||||
|
// Fire a webhook, wait for Slack notification, verify all spans share trace_id
|
||||||
|
const traceId = await fireWebhookAndGetTraceId();
|
||||||
|
const spans = await getSpansByTraceId(traceId);
|
||||||
|
const spanNames = spans.map(s => s.name);
|
||||||
|
expect(spanNames).toContain('webhook.ingest');
|
||||||
|
expect(spanNames).toContain('alert.normalize');
|
||||||
|
expect(spanNames).toContain('alert.correlation');
|
||||||
|
expect(spanNames).toContain('notification.slack');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.4 HMAC Security Hardening
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('HMAC Signature Validation (Hardened)', () => {
|
||||||
|
it('uses crypto.timingSafeEqual, not === comparison', () => {
|
||||||
|
// Inspect the source to verify timing-safe comparison
|
||||||
|
const source = fs.readFileSync('src/ingestion/hmac.ts', 'utf8');
|
||||||
|
expect(source).toContain('timingSafeEqual');
|
||||||
|
expect(source).not.toMatch(/signature\s*===\s*/);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('handles case-insensitive header names (dd-webhook-signature vs DD-WEBHOOK-SIGNATURE)', async () => {
|
||||||
|
const payload = makeAlertPayload('datadog');
|
||||||
|
const sig = computeHMAC(payload, DATADOG_SECRET);
|
||||||
|
|
||||||
|
// Lowercase header
|
||||||
|
const resp1 = await ingest(payload, { 'dd-webhook-signature': sig });
|
||||||
|
expect(resp1.status).toBe(200);
|
||||||
|
|
||||||
|
// Uppercase header
|
||||||
|
const resp2 = await ingest(payload, { 'DD-WEBHOOK-SIGNATURE': sig });
|
||||||
|
expect(resp2.status).toBe(200);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('rejects completely missing signature header', async () => {
|
||||||
|
const resp = await ingest(makeAlertPayload('datadog'), {});
|
||||||
|
expect(resp.status).toBe(401);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('rejects empty signature header', async () => {
|
||||||
|
const resp = await ingest(makeAlertPayload('datadog'), { 'dd-webhook-signature': '' });
|
||||||
|
expect(resp.status).toBe(401);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.5 SQS 256KB Payload Limit
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Large Payload Handling', () => {
|
||||||
|
it('compresses payloads >200KB before sending to SQS', async () => {
|
||||||
|
const largePayload = makeLargeAlertPayload(300 * 1024); // 300KB
|
||||||
|
const resp = await ingest(largePayload);
|
||||||
|
expect(resp.status).toBe(200);
|
||||||
|
|
||||||
|
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||||
|
// Payload must be compressed or use S3 pointer
|
||||||
|
expect(msg.Body.length).toBeLessThan(256 * 1024);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('uses S3 pointer for payloads >256KB after compression', async () => {
|
||||||
|
const hugePayload = makeLargeAlertPayload(500 * 1024); // 500KB
|
||||||
|
const resp = await ingest(hugePayload);
|
||||||
|
expect(resp.status).toBe(200);
|
||||||
|
|
||||||
|
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||||
|
const body = JSON.parse(msg.Body);
|
||||||
|
expect(body.s3Pointer).toBeDefined();
|
||||||
|
expect(body.s3Pointer).toMatch(/^s3:\/\/dd0c-alert-overflow\//);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('strips unnecessary fields from Datadog payload before SQS', async () => {
|
||||||
|
const payload = makeDatadogPayloadWithLargeTags(100); // 100 tags
|
||||||
|
const resp = await ingest(payload);
|
||||||
|
expect(resp.status).toBe(200);
|
||||||
|
|
||||||
|
const msg = await getLastSQSMessage(localstack, 'alert-queue');
|
||||||
|
const normalized = JSON.parse(msg.Body);
|
||||||
|
// Only essential fields should remain
|
||||||
|
expect(normalized.tags.length).toBeLessThanOrEqual(20);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('rejects payloads >2MB at API Gateway level', async () => {
|
||||||
|
const massive = makeLargeAlertPayload(3 * 1024 * 1024);
|
||||||
|
const resp = await ingest(massive);
|
||||||
|
expect(resp.status).toBe(413);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.6 DLQ Backpressure & Replay
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('DLQ Replay with Backpressure', () => {
|
||||||
|
it('replays DLQ messages in batches of 100', async () => {
|
||||||
|
await seedDLQ(10000); // 10K messages
|
||||||
|
const replayer = new DLQReplayer({ batchSize: 100, delayBetweenBatchesMs: 500 });
|
||||||
|
await replayer.start();
|
||||||
|
|
||||||
|
// Verify batched processing
|
||||||
|
expect(replayer.batchesProcessed).toBeGreaterThan(0);
|
||||||
|
expect(replayer.maxConcurrentMessages).toBeLessThanOrEqual(100);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('pauses replay if correlation engine error rate exceeds 10%', async () => {
|
||||||
|
await seedDLQ(1000);
|
||||||
|
const replayer = new DLQReplayer({ batchSize: 100, errorThreshold: 0.1 });
|
||||||
|
|
||||||
|
// Simulate correlation engine returning errors
|
||||||
|
mockCorrelationEngine.failRate = 0.15;
|
||||||
|
await replayer.start();
|
||||||
|
|
||||||
|
expect(replayer.state).toBe('paused');
|
||||||
|
expect(replayer.pauseReason).toContain('error rate exceeded');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does not replay if circuit breaker is currently tripped', async () => {
|
||||||
|
await seedDLQ(100);
|
||||||
|
await tripCircuitBreaker();
|
||||||
|
|
||||||
|
const replayer = new DLQReplayer();
|
||||||
|
await replayer.start();
|
||||||
|
|
||||||
|
expect(replayer.messagesReplayed).toBe(0);
|
||||||
|
expect(replayer.state).toBe('blocked_by_circuit_breaker');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('tracks replay progress for resumability', async () => {
|
||||||
|
await seedDLQ(500);
|
||||||
|
const replayer = new DLQReplayer({ batchSize: 50 });
|
||||||
|
|
||||||
|
// Process 3 batches then stop
|
||||||
|
await replayer.processNBatches(3);
|
||||||
|
expect(replayer.checkpoint).toBe(150);
|
||||||
|
|
||||||
|
// Resume from checkpoint
|
||||||
|
const replayer2 = new DLQReplayer({ resumeFrom: replayer.checkpoint });
|
||||||
|
await replayer2.start();
|
||||||
|
expect(replayer2.startedFrom).toBe(150);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.7 Multi-Tenancy Isolation (DynamoDB)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('DynamoDB Tenant Isolation', () => {
|
||||||
|
it('all DAO methods require tenantId parameter', () => {
|
||||||
|
// Compile-time check: DAO interface has tenantId as first param
|
||||||
|
const daoSource = fs.readFileSync('src/data/incident-dao.ts', 'utf8');
|
||||||
|
const methods = extractPublicMethods(daoSource);
|
||||||
|
for (const method of methods) {
|
||||||
|
expect(method.params[0].name).toBe('tenantId');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('query for tenant A returns zero results for tenant B data', async () => {
|
||||||
|
const dao = new IncidentDAO(dynamoClient);
|
||||||
|
await dao.create('tenant-A', makeIncident());
|
||||||
|
await dao.create('tenant-B', makeIncident());
|
||||||
|
|
||||||
|
const results = await dao.list('tenant-A');
|
||||||
|
expect(results.every(r => r.tenantId === 'tenant-A')).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('partition key always includes tenantId prefix', async () => {
|
||||||
|
const dao = new IncidentDAO(dynamoClient);
|
||||||
|
await dao.create('tenant-X', makeIncident());
|
||||||
|
|
||||||
|
// Read raw DynamoDB item
|
||||||
|
const item = await dynamoClient.scan({ TableName: 'dd0c-alert-main' });
|
||||||
|
    expect(item.Items[0].PK.S).toMatch(/^TENANT#tenant-X/);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.8 Slack Circuit Breaker
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Slack Notification Circuit Breaker', () => {
|
||||||
|
it('opens circuit after 10 consecutive 429s from Slack', async () => {
|
||||||
|
const slackClient = new SlackClient({ circuitBreakerThreshold: 10 });
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
mockSlack.respondWith(429);
|
||||||
|
await slackClient.send(makeMessage()).catch(() => {});
|
||||||
|
}
|
||||||
|
expect(slackClient.circuitState).toBe('open');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('queues notifications while circuit is open', async () => {
|
||||||
|
slackClient.openCircuit();
|
||||||
|
await slackClient.send(makeMessage());
|
||||||
|
expect(slackClient.queuedMessages).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('half-opens circuit after 60 seconds', async () => {
|
||||||
|
slackClient.openCircuit();
|
||||||
|
clock.advanceBy(61000);
|
||||||
|
expect(slackClient.circuitState).toBe('half-open');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('drains queue on successful half-open probe', async () => {
|
||||||
|
slackClient.openCircuit();
|
||||||
|
slackClient.queue(makeMessage());
|
||||||
|
slackClient.queue(makeMessage());
|
||||||
|
clock.advanceBy(61000);
|
||||||
|
mockSlack.respondWith(200);
|
||||||
|
await slackClient.probe();
|
||||||
|
expect(slackClient.circuitState).toBe('closed');
|
||||||
|
expect(slackClient.queuedMessages).toBe(0);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.9 Updated Test Pyramid (Post-Review)
|
||||||
|
|
||||||
|
| Level | Original | Revised | Rationale |
|
||||||
|
|-------|----------|---------|-----------|
|
||||||
|
| Unit | 70% (~140) | 65% (~180) | More tests total, but integration share grows |
|
||||||
|
| Integration | 20% (~40) | 25% (~70) | Dashboard API, tenant isolation, trace propagation |
|
||||||
|
| E2E | 10% (~20) | 10% (~28) | Dashboard UI (Playwright), onboarding flow |
|
||||||
|
|
||||||
|
*End of P3 Review Remediation Addendum*
|
||||||
|
|||||||
@@ -1107,3 +1107,161 @@ Phase 7: E2E Validation
|
|||||||
---
|
---
|
||||||
|
|
||||||
*End of dd0c/portal Test Architecture*
|
*End of dd0c/portal Test Architecture*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Review Remediation Addendum (Post-Gemini Review)
|
||||||
|
|
||||||
|
### 11.1 Resolve Database Misalignment (PostgreSQL vs DynamoDB)
|
||||||
|
|
||||||
|
Epic 10.2 specified a DynamoDB single-table design, but the Architecture and Test Architecture are fundamentally built around PostgreSQL (Aurora Serverless v2) with pgvector.
|
||||||
|
**Resolution:** The IDP requires relational joins and vector search. PostgreSQL is the definitive catalog database. DynamoDB references are removed.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/schema/migration_validation_test.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn elastic_schema_postgres_migration_is_additive_only() {
|
||||||
|
let migrations = read_sql_migrations("./migrations");
|
||||||
|
for migration in migrations {
|
||||||
|
assert!(!migration.contains("DROP COLUMN"), "Destructive schema change detected");
|
||||||
|
assert!(!migration.contains("ALTER COLUMN"), "Type modification detected");
|
||||||
|
assert!(!migration.contains("RENAME COLUMN"), "Column rename detected");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn migration_does_not_hold_exclusive_locks_on_reads() {
|
||||||
|
// Concurrent index creation tests
|
||||||
|
assert!(migration_contains("CREATE INDEX CONCURRENTLY"),
|
||||||
|
"Indexes must be created concurrently to avoid locking the catalog");
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.2 Invert the Test Pyramid (Integration Honeycomb)
|
||||||
|
|
||||||
|
Shift from 70% Unit (with heavy moto/responses mocking) to 30/60/10 with VCR and LocalStack.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/integration/scanners/test_aws_scanner.py
|
||||||
|
|
||||||
|
@pytest.mark.vcr()
|
||||||
|
def test_aws_scanner_discovers_ecs_services_and_api_gateways(vcr_cassette):
|
||||||
|
# Uses real recorded AWS API responses, not moto mocks
|
||||||
|
# Validates actual boto3 parsing against real-world AWS shapes
|
||||||
|
scanner = AWSDiscoveryScanner(account_id="123456789012", region="us-east-1")
|
||||||
|
services = scanner.scan()
|
||||||
|
assert len(services) > 0
|
||||||
|
assert any(s.type == "ecs_service" for s in services)
|
||||||
|
|
||||||
|
@pytest.mark.vcr()
|
||||||
|
def test_github_scanner_handles_graphql_pagination(vcr_cassette):
|
||||||
|
# Validates real GitHub GraphQL paginated responses
|
||||||
|
scanner = GitHubDiscoveryScanner(org_name="dd0c")
|
||||||
|
repos = scanner.scan()
|
||||||
|
assert len(repos) > 100 # Proves pagination logic works
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.3 Missing Epic Coverage
|
||||||
|
|
||||||
|
#### Epic 3.4: PagerDuty & OpsGenie Integrations
|
||||||
|
```python
|
||||||
|
# tests/integration/test_pagerduty_sync.py
|
||||||
|
|
||||||
|
@pytest.mark.vcr()
|
||||||
|
def test_pagerduty_sync_maps_schedules_to_catalog_teams():
|
||||||
|
sync = PagerDutySyncer(api_key="sk-test-key")
|
||||||
|
teams = sync.fetch_oncall_schedules()
|
||||||
|
assert teams[0].oncall_email is not None
|
||||||
|
|
||||||
|
def test_pagerduty_credentials_are_encrypted_at_rest():
|
||||||
|
# Verify KMS envelope encryption for 3rd party API keys
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 4.3: Redis Prefix Caching for Cmd+K
|
||||||
|
```python
|
||||||
|
# tests/integration/test_search_cache.py
|
||||||
|
|
||||||
|
def test_cmd_k_search_hits_redis_cache_before_postgres():
|
||||||
|
redis_client.set("search:auth", json.dumps([{"name": "auth-service"}]))
|
||||||
|
# Must return < 5ms from Redis, skipping DB
|
||||||
|
result = search_api.query("auth")
|
||||||
|
assert result[0]['name'] == "auth-service"
|
||||||
|
|
||||||
|
def test_catalog_update_invalidates_search_cache():
|
||||||
|
# Create new service
|
||||||
|
catalog_api.create_service("billing-api")
|
||||||
|
# Prefix cache must be purged
|
||||||
|
assert redis_client.keys("search:*") == []
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epics 5 & 6: UI and Dashboards (Playwright)
|
||||||
|
```typescript
|
||||||
|
// tests/e2e/ui/catalog.spec.ts
|
||||||
|
|
||||||
|
test('service catalog renders progressive disclosure UI', async ({ page }) => {
|
||||||
|
await page.goto('/catalog');
|
||||||
|
// Click expands details instead of navigating away
|
||||||
|
await page.click('[data-testid="service-row-auth-api"]');
|
||||||
|
await expect(page.locator('[data-testid="service-drawer"]')).toBeVisible();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('dashboard KPI aggregation shows total services and ownership coverage', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard');
|
||||||
|
await expect(page.locator('[data-testid="kpi-total-services"]')).toHaveText("150");
|
||||||
|
await expect(page.locator('[data-testid="kpi-ownership"]')).toHaveText("85%");
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 9: Onboarding & Stripe
|
||||||
|
```python
|
||||||
|
# tests/integration/test_stripe_webhooks.py
|
||||||
|
|
||||||
|
def test_stripe_checkout_completed_upgrades_tenant_tier():
|
||||||
|
payload = load_fixture("stripe_checkout_completed.json")
|
||||||
|
signature = generate_stripe_signature(payload, secret)
|
||||||
|
|
||||||
|
response = api_client.post("/webhooks/stripe", data=payload, headers={"Stripe-Signature": signature})
|
||||||
|
assert response.status_code == 200
|
||||||
|
|
||||||
|
tenant = db.get_tenant("t-123")
|
||||||
|
assert tenant.tier == "pro"
|
||||||
|
|
||||||
|
def test_websocket_streams_discovery_progress_during_onboarding():
|
||||||
|
# Connect WS client, trigger discovery, assert WS receives "discovering AWS...", "found 50 resources..."
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.4 Scaled Performance Benchmarks
|
||||||
|
```python
|
||||||
|
# tests/performance/test_discovery_scale.py
|
||||||
|
|
||||||
|
def test_discovery_pipeline_handles_10000_aws_resources_without_step_functions_payload_limit():
|
||||||
|
# Simulate an AWS environment with 10k resources
|
||||||
|
# Must chunk state machine transitions to stay under 256KB Step Functions limit
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_discovery_pipeline_handles_1000_github_repos():
|
||||||
|
# Verify GraphQL batching and rate limit backoff
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.5 Edge Case Resilience
|
||||||
|
```python
|
||||||
|
def test_github_graphql_concurrent_rate_limiting():
|
||||||
|
# If 5 tenants scan concurrently, respect Retry-After headers across workers
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_partial_discovery_scan_does_not_corrupt_catalog():
|
||||||
|
# If GitHub scan times out halfway, existing services must NOT be marked stale
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_ownership_conflict_resolution():
|
||||||
|
# If two discovery sources claim the same repo, prioritize Explicit (Config) over Implicit (Tags)
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_meilisearch_index_rebuild_does_not_drop_search():
|
||||||
|
# Verify zero-downtime index swapping during mapping updates
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
# dd0c/cost — Test Architecture & TDD Strategy
|
# dd0c/cost — Test Architecture & TDD Strategy
|
||||||
|
|
||||||
**Product:** dd0c/cost — AWS Cost Anomaly Detective
|
**Product:** dd0c/cost — AWS Cost Anomaly Detective
|
||||||
**Author:** Test Architecture Phase
|
**Author:** Test Architecture Phase (v2 — Post-Review Rewrite)
|
||||||
**Date:** February 28, 2026
|
**Date:** March 1, 2026
|
||||||
**Status:** V1 MVP — Solo Founder Scope
|
**Status:** V1 MVP — Solo Founder Scope
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -13,7 +13,9 @@
|
|||||||
|
|
||||||
dd0c/cost sits at the intersection of **money and infrastructure**. A false negative means a customer loses thousands of dollars. A false positive means alert fatigue and churn. The test suite's primary job is to mathematically prove the anomaly scoring engine works across edge cases.
|
dd0c/cost sits at the intersection of **money and infrastructure**. A false negative means a customer loses thousands of dollars. A false positive means alert fatigue and churn. The test suite's primary job is to mathematically prove the anomaly scoring engine works across edge cases.
|
||||||
|
|
||||||
Guiding principle: **Test the math first, test the infrastructure second.** The Z-score and novelty algorithms must be exhaustively unit-tested with synthetic data before any AWS APIs are mocked.
|
Guiding principle: **Test the math first, test the infrastructure second.** The Z-score and novelty algorithms must be exhaustively verified with property-based testing before any AWS APIs are mocked.
|
||||||
|
|
||||||
|
Second principle: **Every dollar matters.** Cost calculations involve floating-point arithmetic on money. Rounding errors, precision loss, and currency handling must be tested with the same rigor as a financial system.
|
||||||
|
|
||||||
### 1.2 Red-Green-Refactor Adapted to dd0c/cost
|
### 1.2 Red-Green-Refactor Adapted to dd0c/cost
|
||||||
|
|
||||||
@@ -28,33 +30,52 @@ REFACTOR → Optimize the baseline lookup, extract novelty checks,
|
|||||||
```
|
```
|
||||||
|
|
||||||
**When to write tests first (strict TDD):**
|
**When to write tests first (strict TDD):**
|
||||||
- Anomaly scoring engine (Z-scores, novelty checks, composite severity)
|
- All anomaly scoring (Z-scores, novelty checks, composite severity)
|
||||||
- Cold-start heuristics (fast-path for >$5/hr resources)
|
- All cold-start heuristics (fast-path for >$5/hr resources)
|
||||||
- Baseline calculation (moving averages, standard deviation)
|
- All baseline calculation (Welford algorithm, maturity transitions)
|
||||||
- Governance policy (strict vs. audit mode, 14-day promotion)
|
- All governance policy (strict vs. audit mode, 14-day auto-promotion, panic mode)
|
||||||
|
- All Slack signature validation (security-critical)
|
||||||
|
- All cost calculations (pricing lookup, hourly cost estimation)
|
||||||
|
- All feature flag circuit breakers
|
||||||
|
|
||||||
**When integration tests lead:**
|
**When integration tests lead:**
|
||||||
- CloudTrail ingestion (implement against LocalStack EventBridge, then lock in)
|
- CloudTrail ingestion (implement against LocalStack EventBridge, then lock in)
|
||||||
- DynamoDB Single-Table schema (build access patterns, then integration test)
|
- DynamoDB Single-Table schema (build access patterns, then integration test)
|
||||||
|
- Cross-account STS role assumption (test against LocalStack)
|
||||||
|
|
||||||
**When E2E tests lead:**
|
**When E2E tests lead:**
|
||||||
- The Slack alert interaction (format block kit, test the "Snooze/Terminate" buttons)
|
- Slack alert interaction (format block kit, test "Snooze/Terminate" buttons)
|
||||||
|
- Onboarding wizard (CloudFormation quick-create → role validation → first alert)
|
||||||
|
|
||||||
### 1.3 Test Naming Conventions
|
### 1.3 Test Naming Conventions
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
|
// Unit tests
|
||||||
describe('AnomalyScorer', () => {
|
describe('AnomalyScorer', () => {
|
||||||
it('assigns critical severity when Z-score > 3 and hourly cost > $1', () => {});
|
it('assigns critical severity when Z-score exceeds 3 and hourly cost exceeds $1', () => {});
|
||||||
it('flags actor novelty when IAM role has never launched this service', () => {});
|
it('flags actor novelty when IAM role has never launched this service type', () => {});
|
||||||
it('bypasses baseline and triggers fast-path critical for $10/hr instance', () => {});
|
it('bypasses baseline and triggers fast-path critical for $10/hr instance', () => {});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('CloudTrailNormalizer', () => {
|
describe('BaselineCalculator', () => {
|
||||||
it('extracts instance type and region from RunInstances event', () => {});
|
it('updates running mean using Welford online algorithm', () => {});
|
||||||
it('looks up correct on-demand pricing for us-east-1 r6g.xlarge', () => {});
|
it('handles zero standard deviation without division by zero', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Property-based tests
|
||||||
|
describe('AnomalyScorer (property-based)', () => {
|
||||||
|
it('always returns severity between 0 and 100 for any valid input', () => {});
|
||||||
|
it('monotonically increases score as Z-score increases', () => {});
|
||||||
|
it('never assigns critical to events below $0.50/hr regardless of Z-score', () => {});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Rules:**
|
||||||
|
- Describe the observable outcome, not the implementation
|
||||||
|
- Use present tense
|
||||||
|
- If you need "and" in the name, split into two tests
|
||||||
|
- Property-based tests explicitly state the invariant
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 2: Test Pyramid
|
## Section 2: Test Pyramid
|
||||||
@@ -63,93 +84,441 @@ describe('CloudTrailNormalizer', () => {
|
|||||||
|
|
||||||
| Level | Target | Count (V1) | Runtime |
|
| Level | Target | Count (V1) | Runtime |
|
||||||
|-------|--------|------------|---------|
|
|-------|--------|------------|---------|
|
||||||
| Unit | 70% | ~250 tests | <20s |
|
| Unit | 80% | ~350 tests | <25s |
|
||||||
| Integration | 20% | ~80 tests | <3min |
|
| Integration | 15% | ~65 tests | <4min |
|
||||||
| E2E/Smoke | 10% | ~15 tests | <5min |
|
| E2E/Smoke | 5% | ~15 tests | <8min |
|
||||||
|
|
||||||
|
Higher unit ratio than other dd0c products because the core value is pure math (scoring, baselines, Z-scores).
|
||||||
|
|
||||||
### 2.2 Unit Test Targets
|
### 2.2 Unit Test Targets
|
||||||
|
|
||||||
| Component | Key Behaviors | Est. Tests |
|
| Component | Key Behaviors | Est. Tests |
|
||||||
|-----------|--------------|------------|
|
|-----------|--------------|------------|
|
||||||
| Event Normalizer | CloudTrail parsing, pricing lookup, deduplication | 40 |
|
| CloudTrail Normalizer | Event parsing, pricing lookup, dedup, field extraction | 40 |
|
||||||
| Baseline Engine | Running mean/stddev calculation, maturity checks | 35 |
|
| Baseline Engine | Welford algorithm, maturity transitions, feedback loop | 45 |
|
||||||
| Anomaly Scorer | Z-score math, novelty detection, composite scoring | 50 |
|
| Anomaly Scorer | Z-score, novelty, composite scoring, cold-start fast-path | 60 |
|
||||||
| Remediation Handler | Stop/Terminate payload parsing, IAM role assumption logic | 20 |
|
| Zombie Hunter | Idle resource detection, cost estimation, age calculation | 25 |
|
||||||
| Notification Engine | Slack formatting, daily digest aggregation | 30 |
|
| Notification Formatter | Slack Block Kit, daily digest, CLI command generation | 30 |
|
||||||
| Governance Policy | Mode enforcement, 14-day auto-promotion | 25 |
|
| Slack Bot | Command parsing, signature validation, action handling | 25 |
|
||||||
| Feature Flags | Circuit breaker on alert volume, flag metadata | 15 |
|
| Remediation Handler | Stop/Terminate logic, IAM role assumption, snooze/dismiss | 20 |
|
||||||
|
| Dashboard API | CRUD, tenant isolation, pagination, filtering | 25 |
|
||||||
|
| Governance Policy | Mode enforcement, 14-day promotion, panic mode | 30 |
|
||||||
|
| Feature Flags | Circuit breaker, flag lifecycle, local evaluation | 15 |
|
||||||
|
| Onboarding | CFN template validation, role validation, free tier enforcement | 20 |
|
||||||
|
| Cost Calculations | Pricing precision, rounding, fallback pricing, currency | 15 |
|
||||||
|
|
||||||
|
### 2.3 Integration Test Boundaries
|
||||||
|
|
||||||
|
| Boundary | What's Tested | Infrastructure |
|
||||||
|
|----------|--------------|----------------|
|
||||||
|
| EventBridge → SQS FIFO | Cross-account event routing, dedup, ordering | LocalStack |
|
||||||
|
| SQS → Event Processor Lambda | Batch processing, error handling, DLQ routing | LocalStack |
|
||||||
|
| Event Processor → DynamoDB | CostEvent writes, baseline updates, transactions | Testcontainers DynamoDB Local |
|
||||||
|
| Anomaly Scorer → DynamoDB | Baseline reads, anomaly record writes | Testcontainers DynamoDB Local |
|
||||||
|
| Notifier → Slack API | Block Kit delivery, rate limiting, message updates | WireMock |
|
||||||
|
| API Gateway → Lambda | Auth (Cognito JWT), routing, throttling | LocalStack |
|
||||||
|
| STS → Customer Account | Cross-account role assumption, ExternalId validation | LocalStack |
|
||||||
|
| CDK Synth | Infrastructure snapshot, resource policy validation | CDK assertions |
|
||||||
|
|
||||||
|
### 2.4 E2E/Smoke Scenarios
|
||||||
|
|
||||||
|
1. **Real-Time Anomaly Detection**: CloudTrail event → scoring → Slack alert (<30s)
|
||||||
|
2. **Interactive Remediation**: Slack button click → StopInstances → message update
|
||||||
|
3. **Onboarding Flow**: Signup → CFN deploy → role validation → first alert
|
||||||
|
4. **14-Day Auto-Promotion**: Simulate 14 days → verify strict→audit transition
|
||||||
|
5. **Zombie Hunter**: Daily scan → detect idle EC2 → Slack digest
|
||||||
|
6. **Panic Mode**: Enable panic → all alerting stops → anomalies still logged
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 3: Unit Test Strategy
|
## Section 3: Unit Test Strategy
|
||||||
|
|
||||||
### 3.1 Cost Ingestion & Normalization
|
### 3.1 CloudTrail Normalizer
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
describe('CloudTrailNormalizer', () => {
|
describe('CloudTrailNormalizer', () => {
|
||||||
it('normalizes EC2 RunInstances event to CostEvent schema', () => {});
|
describe('Event Parsing', () => {
|
||||||
it('normalizes RDS CreateDBInstance event to CostEvent schema', () => {});
|
it('normalizes EC2 RunInstances to CostEvent schema', () => {});
|
||||||
it('extracts assumed role ARN as actor instead of base STS role', () => {});
|
it('normalizes RDS CreateDBInstance to CostEvent schema', () => {});
|
||||||
it('applies fallback pricing when instance type is not in static table', () => {});
|
it('normalizes Lambda CreateFunction to CostEvent schema', () => {});
|
||||||
it('ignores non-cost-generating events (e.g., DescribeInstances)', () => {});
|
it('extracts assumed role ARN as actor (not base STS role)', () => {});
|
||||||
|
it('extracts instance type, region, and AZ from event detail', () => {});
|
||||||
|
it('handles batched RunInstances (multiple instances in one call)', () => {});
|
||||||
|
it('ignores non-cost-generating events (DescribeInstances, ListBuckets)', () => {});
|
||||||
|
it('handles malformed CloudTrail JSON without crashing', () => {});
|
||||||
|
it('handles missing optional fields gracefully', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Pricing Lookup', () => {
|
||||||
|
it('looks up correct on-demand price for us-east-1 m5.xlarge', () => {});
|
||||||
|
it('looks up correct on-demand price for us-west-2 r6g.2xlarge', () => {});
|
||||||
|
it('applies fallback pricing when instance type not in static table', () => {});
|
||||||
|
it('returns $0 for instance types with no pricing data and logs warning', () => {});
|
||||||
|
it('handles GPU instances (p4d, g5) with correct pricing', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Deduplication', () => {
|
||||||
|
it('generates deterministic fingerprint from eventID', () => {});
|
||||||
|
it('detects duplicate CloudTrail events by eventID', () => {});
|
||||||
|
it('allows same resource type from different events', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Cost Precision', () => {
|
||||||
|
it('calculates hourly cost with 4 decimal places', () => {});
|
||||||
|
it('rounds consistently (round-half-to-even) to avoid accumulation errors', () => {});
|
||||||
|
it('handles sub-cent costs for Lambda invocations', () => {});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3.2 Anomaly Engine (The Math)
|
### 3.2 Anomaly Scorer
|
||||||
|
|
||||||
|
The most critical component. Uses property-based testing via `fast-check`.
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
describe('AnomalyScorer', () => {
|
describe('AnomalyScorer', () => {
|
||||||
describe('Statistical Scoring (Z-Score)', () => {
|
describe('Z-Score Calculation', () => {
|
||||||
it('returns score=0 when event cost exactly matches baseline mean', () => {});
|
it('returns 0 when event cost exactly matches baseline mean', () => {});
|
||||||
it('returns proportional score for Z-scores between 1.0 and 3.0', () => {});
|
it('returns proportional score for Z-scores between 1.0 and 3.0', () => {});
|
||||||
it('caps Z-score contribution at max threshold', () => {});
|
it('caps Z-score contribution at configurable max threshold', () => {});
|
||||||
|
it('handles zero standard deviation without division by zero', () => {});
|
||||||
|
it('handles single data point baseline (stddev undefined)', () => {});
|
||||||
|
it('handles extremely large values without float overflow', () => {});
|
||||||
|
it('handles negative cost delta (cost decrease) as non-anomalous', () => {});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('Novelty Scoring', () => {
|
describe('Novelty Scoring', () => {
|
||||||
it('adds novelty penalty when instance type is first seen for account', () => {});
|
it('adds instance novelty penalty when type first seen for account', () => {});
|
||||||
it('adds novelty penalty when IAM user has never provisioned this service', () => {});
|
it('adds actor novelty penalty when IAM role is new', () => {});
|
||||||
|
it('does not penalize known instance type + known actor', () => {});
|
||||||
|
it('weights instance novelty higher than actor novelty', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Composite Scoring', () => {
|
||||||
|
it('combines Z-score + novelty into composite severity', () => {});
|
||||||
|
it('classifies composite < 30 as info', () => {});
|
||||||
|
it('classifies composite 30-60 as warning', () => {});
|
||||||
|
it('classifies composite > 60 as critical', () => {});
|
||||||
|
it('never assigns critical to events below $0.50/hr', () => {});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('Cold-Start Fast Path', () => {
|
describe('Cold-Start Fast Path', () => {
|
||||||
it('flags $5/hr instance as warning when baseline < 14 days', () => {});
|
it('flags $5/hr instance as warning when baseline < 14 days', () => {});
|
||||||
it('flags $25/hr instance as critical immediately, bypassing baseline', () => {});
|
it('flags $25/hr instance as critical immediately, bypassing baseline', () => {});
|
||||||
it('ignores $0.10/hr instances during cold-start learning period', () => {});
|
it('ignores $0.10/hr instances during cold-start learning', () => {});
|
||||||
|
it('fast-path is always on — not behind a feature flag', () => {});
|
||||||
|
it('transitions from fast-path to statistical scoring at maturity', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Feedback Loop', () => {
|
||||||
|
it('reduces score for resources marked as expected', () => {});
|
||||||
|
it('adds actor to expected list after mark-as-expected', () => {});
|
||||||
|
it('still flags expected actor if cost is 10x above baseline', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Property-Based Tests (fast-check)', () => {
|
||||||
|
it('score is always between 0 and 100 for any valid input', () => {
|
||||||
|
// fc.assert(fc.property(
|
||||||
|
// fc.record({ cost: fc.float({min: 0}), mean: fc.float({min: 0}), stddev: fc.float({min: 0}) }),
|
||||||
|
// (input) => { const score = scorer.score(input); return score >= 0 && score <= 100; }
|
||||||
|
// ))
|
||||||
|
});
|
||||||
|
it('score monotonically increases as cost increases (baseline fixed)', () => {});
|
||||||
|
it('score monotonically increases as Z-score increases', () => {});
|
||||||
|
it('cold-start fast-path always triggers for cost > $25/hr', () => {});
|
||||||
|
it('mature baseline never uses fast-path thresholds', () => {});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3.3 Baseline Learning
|
### 3.3 Baseline Engine
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
describe('BaselineCalculator', () => {
|
describe('BaselineCalculator', () => {
|
||||||
it('updates running mean and stddev using Welford algorithm', () => {});
|
describe('Welford Online Algorithm', () => {
|
||||||
|
it('updates running mean correctly after each observation', () => {});
|
||||||
|
it('updates running variance correctly after each observation', () => {});
|
||||||
|
it('produces correct stddev after 100 observations', () => {});
|
||||||
|
it('handles first observation (count=1, stddev=0)', () => {});
|
||||||
|
it('handles identical observations (stddev=0)', () => {});
|
||||||
|
it('handles catastrophic cancellation with large values', () => {
|
||||||
|
// Welford is numerically stable — verify this property
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Maturity Transitions', () => {
|
||||||
|
it('starts in cold-start state', () => {});
|
||||||
|
it('transitions to learning after 5 events', () => {});
|
||||||
|
it('transitions to mature after 20 events AND 14 days', () => {});
|
||||||
|
it('does not mature with 100 events but only 3 days', () => {});
|
||||||
|
it('does not mature with 14 days but only 5 events', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Actor & Instance Tracking', () => {
|
||||||
it('adds new actor to observed_actors set', () => {});
|
it('adds new actor to observed_actors set', () => {});
|
||||||
it('marks baseline as mature when event_count > 20 and age_days > 14', () => {});
|
it('adds new instance type to observed_types set', () => {});
|
||||||
|
it('does not duplicate existing actors', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Property-Based Tests', () => {
|
||||||
|
it('mean converges to true mean as observations increase', () => {});
|
||||||
|
it('variance is always non-negative', () => {});
|
||||||
|
it('stddev equals sqrt(variance) within float tolerance', () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.4 Zombie Hunter
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('ZombieHunter', () => {
|
||||||
|
it('detects EC2 instance running >7 days with <5% CPU utilization', () => {});
|
||||||
|
it('detects RDS instance with 0 connections for >3 days', () => {});
|
||||||
|
it('detects unattached EBS volumes older than 7 days', () => {});
|
||||||
|
it('calculates cumulative waste cost for each zombie', () => {});
|
||||||
|
it('excludes instances tagged dd0c:ignore', () => {});
|
||||||
|
it('handles API pagination for accounts with 500+ instances', () => {});
|
||||||
|
it('respects read-only IAM permissions (never modifies resources)', () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.5 Notification Formatter
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('NotificationFormatter', () => {
|
||||||
|
describe('Slack Block Kit', () => {
|
||||||
|
it('formats EC2 anomaly with resource type, region, cost, actor', () => {});
|
||||||
|
it('formats RDS anomaly with engine, storage, multi-AZ status', () => {});
|
||||||
|
it('includes "Why this alert" section with anomaly signals', () => {});
|
||||||
|
it('includes suggested CLI commands for remediation', () => {});
|
||||||
|
it('includes Snooze/Mark Expected/Stop Instance buttons', () => {});
|
||||||
|
it('generates correct aws ec2 stop-instances command', () => {});
|
||||||
|
it('generates correct aws rds stop-db-instance command', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Daily Digest', () => {
|
||||||
|
it('aggregates 24h of anomalies into summary stats', () => {});
|
||||||
|
it('includes total estimated spend across all accounts', () => {});
|
||||||
|
it('highlights top 3 costliest anomalies', () => {});
|
||||||
|
it('includes zombie resource count and waste estimate', () => {});
|
||||||
|
it('shows baseline learning progress for new accounts', () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.6 Slack Bot
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('SlackBot', () => {
|
||||||
|
describe('Signature Validation', () => {
|
||||||
|
it('validates correct Slack request signature (HMAC-SHA256)', () => {});
|
||||||
|
it('rejects request with invalid signature', () => {});
|
||||||
|
it('rejects request with missing X-Slack-Signature header', () => {});
|
||||||
|
it('rejects request with expired timestamp (>5 min)', () => {});
|
||||||
|
it('uses timing-safe comparison to prevent timing attacks', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Command Parsing', () => {
|
||||||
|
it('routes /dd0c status to status handler', () => {});
|
||||||
|
it('routes /dd0c anomalies to anomaly list handler', () => {});
|
||||||
|
it('routes /dd0c digest to digest handler', () => {});
|
||||||
|
it('returns help text for unknown commands', () => {});
|
||||||
|
it('responds within 3 seconds or defers with 200 OK', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Interactive Actions', () => {
|
||||||
|
it('validates interactive payload signature', () => {});
|
||||||
|
it('handles mark_expected action and updates baseline', () => {});
|
||||||
|
it('handles snooze_1h action and sets snoozeUntil', () => {});
|
||||||
|
it('handles snooze_24h action', () => {});
|
||||||
|
it('updates original Slack message after action', () => {});
|
||||||
|
it('rejects action from user not in authorized workspace', () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.7 Governance Policy Engine
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('GovernancePolicy', () => {
|
||||||
|
describe('Mode Enforcement', () => {
|
||||||
|
it('strict mode: logs anomaly but does not send Slack alert', () => {});
|
||||||
|
it('audit mode: sends Slack alert with full logging', () => {});
|
||||||
|
it('defaults new accounts to strict mode', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('14-Day Auto-Promotion', () => {
|
||||||
|
it('does not promote account with <14 days of baseline', () => {});
|
||||||
|
it('does not promote account with >10% false-positive rate', () => {});
|
||||||
|
it('promotes account on day 15 if FP rate <10%', () => {});
|
||||||
|
it('calculates false-positive rate from mark-as-expected actions', () => {});
|
||||||
|
it('auto-promotion check runs daily via cron', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Panic Mode', () => {
|
||||||
|
it('stops all alerting when panic=true', () => {});
|
||||||
|
it('continues scoring and logging during panic', () => {});
|
||||||
|
it('activates in <1 second via Redis key', () => {});
|
||||||
|
it('activatable via POST /admin/panic', () => {});
|
||||||
|
it('dashboard API returns "alerting paused" header during panic', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Per-Account Override', () => {
|
||||||
|
it('account can set stricter mode than system default', () => {});
|
||||||
|
it('account cannot downgrade from system strict to audit', () => {});
|
||||||
|
it('merge logic: max_restrictive(system, account)', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Policy Decision Logging', () => {
|
||||||
|
it('logs "suppressed by strict mode" with anomaly context', () => {});
|
||||||
|
it('logs "auto-promoted to audit mode" with baseline stats', () => {});
|
||||||
|
it('logs "panic mode active — alerting paused"', () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.8 Dashboard API
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('DashboardAPI', () => {
|
||||||
|
describe('Account Management', () => {
|
||||||
|
it('GET /v1/accounts returns connected accounts for tenant', () => {});
|
||||||
|
it('DELETE /v1/accounts/:id marks account as disconnecting', () => {});
|
||||||
|
it('returns 401 without valid Cognito JWT', () => {});
|
||||||
|
it('scopes all queries to authenticated tenantId', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Anomaly Listing', () => {
|
||||||
|
it('GET /v1/anomalies returns recent anomalies', () => {});
|
||||||
|
it('supports since, status, severity filters', () => {});
|
||||||
|
it('implements cursor-based pagination', () => {});
|
||||||
|
it('includes slackMessageUrl when alert was sent', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Baseline Overrides', () => {
|
||||||
|
it('PATCH /v1/accounts/:id/baselines/:service/:type updates sensitivity', () => {});
|
||||||
|
it('rejects invalid sensitivity values', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Tenant Isolation', () => {
|
||||||
|
it('never returns anomalies from another tenant', () => {});
|
||||||
|
it('never returns accounts from another tenant', () => {});
|
||||||
|
it('enforces tenantId on all DynamoDB queries', () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.9 Onboarding & PLG
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Onboarding', () => {
|
||||||
|
describe('CloudFormation Template', () => {
|
||||||
|
it('generates valid CFN YAML with correct IAM permissions', () => {});
|
||||||
|
it('includes ExternalId parameter', () => {});
|
||||||
|
it('includes EventBridge rule for cost-relevant CloudTrail events', () => {});
|
||||||
|
it('quick-create URL contains correct template URL and parameters', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Role Validation', () => {
|
||||||
|
it('successfully assumes role with correct ExternalId', () => {});
|
||||||
|
it('returns clear error on role not found', () => {});
|
||||||
|
it('returns clear error on ExternalId mismatch', () => {});
|
||||||
|
it('triggers zombie scan on successful connection', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Free Tier Enforcement', () => {
|
||||||
|
it('allows first account connection on free tier', () => {});
|
||||||
|
it('rejects second account with 403 and upgrade prompt', () => {});
|
||||||
|
it('allows multiple accounts on pro tier', () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Stripe Integration', () => {
|
||||||
|
it('creates Stripe Checkout session with correct pricing', () => {});
|
||||||
|
it('handles checkout.session.completed webhook', () => {});
|
||||||
|
it('handles customer.subscription.deleted webhook', () => {});
|
||||||
|
it('validates Stripe webhook signature', () => {});
|
||||||
|
it('updates tenant tier to pro on successful payment', () => {});
|
||||||
|
it('downgrades tenant on subscription cancellation', () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.10 Feature Flag Circuit Breaker
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('AlertVolumeCircuitBreaker', () => {
|
||||||
|
it('allows alerting when volume is within 3x baseline', () => {});
|
||||||
|
it('trips breaker when alerts exceed 3x baseline over 1 hour', () => {});
|
||||||
|
it('auto-disables the scoring flag when breaker trips', () => {});
|
||||||
|
it('buffers suppressed alerts in DLQ for review', () => {});
|
||||||
|
it('tracks alert-per-account rate in Redis sliding window', () => {});
|
||||||
|
it('resets breaker after manual flag re-enable', () => {});
|
||||||
|
it('fast-path alerts are exempt from circuit breaker', () => {});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 4: Integration Test Strategy
|
## Section 4: Integration Test Strategy
|
||||||
|
|
||||||
### 4.1 DynamoDB Data Layer (Testcontainers)
|
### 4.1 DynamoDB Data Layer (Testcontainers)
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
describe('DynamoDB Single-Table Patterns', () => {
|
describe('DynamoDB Integrations', () => {
|
||||||
it('writes CostEvent and updates Baseline in single transaction', async () => {});
|
let dynamodb: StartedTestContainer;
|
||||||
it('queries all anomalies for tenant within time range', async () => {});
|
|
||||||
|
beforeAll(async () => {
|
||||||
|
dynamodb = await new GenericContainer('amazon/dynamodb-local:latest')
|
||||||
|
.withExposedPorts(8000).start();
|
||||||
|
// Create dd0c-cost-main table with GSIs
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Transactional Writes', () => {
|
||||||
|
it('writes CostEvent and updates Baseline in single TransactWriteItem', async () => {});
|
||||||
|
it('fails gracefully if TransactWriteItem encounters ConditionalCheckFailed', async () => {});
|
||||||
|
it('handles partial failure recovery when Baseline update conflicts', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Access Patterns', () => {
|
||||||
|
it('queries all anomalies for tenant within time range (GSI3)', async () => {});
|
||||||
it('fetches tenant config and Slack tokens securely', async () => {});
|
it('fetches tenant config and Slack tokens securely', async () => {});
|
||||||
|
it('retrieves accurate Baseline snapshot by resource type', async () => {});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4.2 AWS API Contract Tests
|
### 4.2 Cross-Account STS & AWS APIs (LocalStack)
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
describe('AWS Cross-Account Actions', () => {
|
describe('AWS Cross-Account Integrations', () => {
|
||||||
// Uses LocalStack to simulate target account
|
let localstack: StartedTestContainer;
|
||||||
it('assumes target account remediation role successfully', async () => {});
|
|
||||||
|
beforeAll(async () => {
|
||||||
|
localstack = await new GenericContainer('localstack/localstack:3')
|
||||||
|
.withEnv('SERVICES', 'sts,ec2,rds')
|
||||||
|
.withExposedPorts(4566).start();
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Role Assumption', () => {
|
||||||
|
it('successfully assumes target account remediation role via STS', async () => {});
|
||||||
|
it('fails when ExternalId does not match (Security)', async () => {});
|
||||||
|
it('handles STS credential expiration gracefully', async () => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Remediation Actions', () => {
|
||||||
it('executes ec2:StopInstances when remediation approved', async () => {});
|
it('executes ec2:StopInstances when remediation approved', async () => {});
|
||||||
it('executes rds:DeleteDBInstance with skip-final-snapshot', async () => {});
|
it('executes rds:StopDBInstance when remediation approved', async () => {});
|
||||||
|
it('fails safely when target IAM role lacks StopInstances permission', async () => {});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Slack API Contract (WireMock)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
describe('Slack Integration', () => {
|
||||||
|
it('formats and delivers Block Kit message successfully', async () => {});
|
||||||
|
it('handles 429 Rate Limit by throwing retryable error for SQS visibility timeout', async () => {});
|
||||||
|
it('updates existing Slack message when anomaly is snoozed', async () => {});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -159,24 +528,65 @@ describe('AWS Cross-Account Actions', () => {
|
|||||||
|
|
||||||
### 5.1 Critical User Journeys
|
### 5.1 Critical User Journeys
|
||||||
|
|
||||||
**Journey 1: Real-Time Anomaly Detection**
|
**Journey 1: Real-Time Anomaly Detection (The Golden Path)**
|
||||||
1. Send synthetic `RunInstances` event to EventBridge (p9.16xlarge, $40/hr).
|
```typescript
|
||||||
2. Verify system processes event and triggers fast-path (no baseline).
|
describe('E2E: Anomaly Detection', () => {
|
||||||
3. Verify Slack alert is generated with correct cost estimate.
|
it('detects anomaly and alerts Slack within 30 seconds', async () => {
|
||||||
|
// 1. Inject synthetic CloudTrail `RunInstances` event (p4d.24xlarge) into SQS Ingestion Queue
|
||||||
|
// 2. Poll DynamoDB to ensure CostEvent was recorded
|
||||||
|
// 3. Poll DynamoDB to ensure AnomalyRecord was created (fast-path triggered)
|
||||||
|
// 4. Assert WireMock received the Slack chat.postMessage call with Block Kit
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
**Journey 2: Interactive Remediation**
|
**Journey 2: Interactive Remediation**
|
||||||
1. Send webhook simulating user clicking "Stop Instance" in Slack.
|
```typescript
|
||||||
2. Verify API Gateway → Lambda executes `StopInstances` against LocalStack.
|
describe('E2E: Interactive Remediation', () => {
|
||||||
3. Verify Slack message updates to "Remediation Successful".
|
it('stops EC2 instance when user clicks Stop in Slack', async () => {
|
||||||
|
// 1. Simulate Slack sending interactive webhook payload for "Stop Instance"
|
||||||
|
// 2. Validate HMAC signature in API Gateway lambda
|
||||||
|
// 3. Verify LocalStack EC2 mock receives StopInstances call
|
||||||
|
// 4. Verify Slack message is updated to "Remediation Successful"
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Journey 3: Onboarding & First Scan**
|
||||||
|
```typescript
|
||||||
|
describe('E2E: Onboarding', () => {
|
||||||
|
it('validates IAM role and triggers initial zombie scan', async () => {
|
||||||
|
// 1. Trigger POST /v1/accounts with new role ARN
|
||||||
|
// 2. Verify account marked active
|
||||||
|
// 3. Verify EventBridge Scheduler creates cron for Zombie Hunter
|
||||||
|
});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 6: Performance & Load Testing
|
## Section 6: Performance & Load Testing
|
||||||
|
|
||||||
|
### 6.1 Ingestion & Scoring Throughput
|
||||||
```typescript
|
```typescript
|
||||||
describe('Ingestion Throughput', () => {
|
describe('Performance: Alert Storm', () => {
|
||||||
it('processes 500 CloudTrail events/second via SQS FIFO', async () => {});
|
it('processes 1000 CloudTrail events/sec without SQS DLQ overflow', async () => {
|
||||||
it('DynamoDB baseline updates complete in <20ms p95', async () => {});
|
// k6 load test hitting SQS directly
|
||||||
|
});
|
||||||
|
|
||||||
|
it('DynamoDB baseline updates complete in <20ms p95 under load', async () => {
|
||||||
|
// Ensure Single-Table schema does not create hot partitions
|
||||||
|
});
|
||||||
|
|
||||||
|
it('Anomaly Scorer Lambda consumes <256MB memory during burst', async () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 Data Scale Tests
|
||||||
|
```typescript
|
||||||
|
describe('Performance: Baseline Scale', () => {
|
||||||
|
it('calculates Z-score in <5ms even when observed_actors set exceeds 1000', async () => {});
|
||||||
|
it('handles accounts with 100,000+ daily CostEvents without throttling DynamoDB (On-Demand scaling)', async () => {});
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -184,49 +594,119 @@ describe('Ingestion Throughput', () => {
|
|||||||
|
|
||||||
## Section 7: CI/CD Pipeline Integration
|
## Section 7: CI/CD Pipeline Integration
|
||||||
|
|
||||||
- **PR Gate:** Unit tests (<2min), Coverage >85% (Scoring engine >95%).
|
### 7.1 Pipeline Stages
|
||||||
- **Merge:** Integration tests with LocalStack & Testcontainers DynamoDB.
|
```
|
||||||
- **Staging:** E2E journeys against isolated staging AWS account.
|
┌─────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐
|
||||||
|
│ Pre-Commit │───▶│ PR Gate │───▶│ Merge │───▶│ Staging │───▶│ Prod │
|
||||||
|
│ (local) │ │ (CI) │ │ (CI) │ │ (CD) │ │ (CD) │
|
||||||
|
└─────────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘
|
||||||
|
lint + type unit tests integration E2E + perf canary
|
||||||
|
<10s math prop Testcontainers LocalStack <5 mins
|
||||||
|
tests <1m <4 mins <10 mins
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.2 Coverage Gates
|
||||||
|
| Component | Threshold |
|
||||||
|
|-----------|-----------|
|
||||||
|
| Anomaly Scorer (Math) | 100% |
|
||||||
|
| CloudTrail Normalizer | 95% |
|
||||||
|
| Governance Policy | 95% |
|
||||||
|
| Slack Signature Auth | 100% |
|
||||||
|
| Overall Pipeline | 85% |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 8: Transparent Factory Tenet Testing
|
## Section 8: Transparent Factory Tenet Testing
|
||||||
|
|
||||||
### 8.1 Atomic Flagging (Circuit Breaker)
|
### 8.1 Atomic Flagging
|
||||||
```typescript
|
```typescript
|
||||||
it('auto-disables scoring rule if it generates >10 alerts/hour for single tenant', () => {});
|
describe('Atomic Flagging', () => {
|
||||||
|
it('auto-disables scoring rule flag if alert volume exceeds 3x baseline in 1hr', () => {});
|
||||||
|
it('buffers suppressed anomalies in SQS DLQ while flag is off', () => {});
|
||||||
|
it('fails CI if any flag TTL exceeds 14 days', () => {});
|
||||||
|
it('evaluates flags strictly locally (in-memory provider)', () => {});
|
||||||
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
### 8.2 Configurable Autonomy (14-Day Auto-Promotion)
|
### 8.2 Elastic Schema
|
||||||
```typescript
|
```typescript
|
||||||
it('keeps new tenant in strict mode (log-only) for first 14 days', () => {});
|
describe('Elastic Schema', () => {
|
||||||
it('auto-promotes to audit mode (auto-alert) on day 15 if false-positive rate < 10%', () => {});
|
it('rejects DynamoDB table definition modifications that alter key schemas', () => {});
|
||||||
|
it('requires all DynamoDB item updates to use ADD/SET (additive only)', () => {});
|
||||||
|
it('ignores unknown attributes (V2 fields) in V1 CostEvent decoders', () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.3 Cognitive Durability
|
||||||
|
```typescript
|
||||||
|
describe('Cognitive Durability', () => {
|
||||||
|
it('requires decision_log.json for any PR modifying Z-score thresholds or weights', () => {});
|
||||||
|
it('enforces cyclomatic complexity < 10 for all AnomalyScorer math functions', () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.4 Semantic Observability
|
||||||
|
```typescript
|
||||||
|
describe('Semantic Observability', () => {
|
||||||
|
it('emits OTEL span for every Anomaly Scoring decision', () => {});
|
||||||
|
it('includes attributes: cost.z_score, cost.anomaly_score, cost.baseline_days', () => {});
|
||||||
|
it('includes cost.fast_path_triggered flag when baseline is bypassed', () => {});
|
||||||
|
it('hashes AWS Account ID in spans to protect PII/tenant identity', () => {});
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.5 Configurable Autonomy
|
||||||
|
```typescript
|
||||||
|
describe('Configurable Autonomy', () => {
|
||||||
|
it('keeps new tenant in Strict Mode (log-only) for first 14 days', () => {});
|
||||||
|
it('auto-promotes to Audit Mode on day 15 if false-positive rate < 10%', () => {});
|
||||||
|
it('Panic Mode halts ALL Slack alerts in <1 second via Redis check', () => {});
|
||||||
|
it('Panic Mode does NOT halt baseline recording (read-only tracking continues)', () => {});
|
||||||
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 9: Test Data & Fixtures
|
## Section 9: Test Data & Fixtures
|
||||||
|
|
||||||
```
|
### 9.1 Data Factories
|
||||||
fixtures/
|
```typescript
|
||||||
cloudtrail/
|
export const makeCloudTrailEvent = (overrides) => ({
|
||||||
ec2-runinstances.json
|
eventVersion: '1.08',
|
||||||
rds-create-db.json
|
userIdentity: { type: 'AssumedRole', arn: 'arn:aws:sts::123:assumed-role/user' },
|
||||||
lambda-create-function.json
|
eventTime: new Date().toISOString(),
|
||||||
baselines/
|
eventSource: 'ec2.amazonaws.com',
|
||||||
mature-steady-spend.json
|
eventName: 'RunInstances',
|
||||||
volatile-dev-account.json
|
requestParameters: { instanceType: 'm5.large' },
|
||||||
cold-start.json
|
...overrides
|
||||||
|
});
|
||||||
|
|
||||||
|
export const makeBaseline = (overrides) => ({
|
||||||
|
meanHourlyCost: 1.25,
|
||||||
|
stdDev: 0.15,
|
||||||
|
eventCount: 45,
|
||||||
|
ageDays: 16,
|
||||||
|
observedActors: ['arn:aws:iam::123:role/ci'],
|
||||||
|
observedInstanceTypes: ['t3.medium', 'm5.large'],
|
||||||
|
...overrides
|
||||||
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Section 10: TDD Implementation Order
|
## Section 10: TDD Implementation Order
|
||||||
|
|
||||||
1. **Phase 1:** Anomaly math + Unit tests (Strict TDD).
|
1. **Phase 1: Math & Core Logic (Strict TDD)**
|
||||||
2. **Phase 2:** CloudTrail normalizer + Pricing tables.
|
- Welford algorithm, Z-score math, Novelty scoring, `fast-check` property tests.
|
||||||
3. **Phase 3:** DynamoDB single-table implementation (Integration led).
|
2. **Phase 2: Ingestion & Normalization**
|
||||||
4. **Phase 4:** Slack formatting + Remediation Lambda.
|
- CloudTrail parsers, pricing static tables, event deduplication.
|
||||||
5. **Phase 5:** Governance policies (14-day promotion logic).
|
3. **Phase 3: Data Persistence (Integration Led)**
|
||||||
|
- DynamoDB Single-Table setup, TransactWriteItems, Testcontainers tests.
|
||||||
|
4. **Phase 4: Notifications & Slack Actions**
|
||||||
|
- Block Kit formatting, Slack signature validation, API Gateway endpoints.
|
||||||
|
5. **Phase 5: Governance & Tenets**
|
||||||
|
- 14-day promotion logic, Panic mode, OTEL tracing.
|
||||||
|
6. **Phase 6: E2E Pipeline**
|
||||||
|
- CDK definitions, LocalStack event injection, wire everything together.
|
||||||
|
|
||||||
*End of dd0c/cost Test Architecture*
|
*End of dd0c/cost Test Architecture (v2)*
|
||||||
|
|||||||
@@ -1760,3 +1760,527 @@ Before writing the `impl ExecutionEngine { pub async fn execute(...) }` function
|
|||||||
5. `engine_pauses_in_flight_execution_when_panic_mode_set`
|
5. `engine_pauses_in_flight_execution_when_panic_mode_set`
|
||||||
|
|
||||||
Only once these tests are defined can the state machine be implemented to make them pass (Green phase). This ensures no execution path can bypass the Trust Gradient.
|
Only once these tests are defined can the state machine be implemented to make them pass (Green phase). This ensures no execution path can bypass the Trust Gradient.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Review Remediation Addendum (Post-Gemini Review)
|
||||||
|
|
||||||
|
The following sections address all gaps identified in the TDD review. These are net-new test specifications that must be integrated into the relevant sections above during implementation.
|
||||||
|
|
||||||
|
### 11.1 Missing Epic Coverage
|
||||||
|
|
||||||
|
#### Epic 3.4: Divergence Analysis
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/executor/divergence/tests.rs
|
||||||
|
|
||||||
|
#[test] fn divergence_detects_extra_command_not_in_runbook() {}
|
||||||
|
#[test] fn divergence_detects_modified_command_vs_prescribed() {}
|
||||||
|
#[test] fn divergence_detects_skipped_step_not_marked_as_skipped() {}
|
||||||
|
#[test] fn divergence_report_includes_diff_of_prescribed_vs_actual() {}
|
||||||
|
#[test] fn divergence_flags_env_var_changes_made_during_execution() {}
|
||||||
|
#[test] fn divergence_ignores_whitespace_differences_in_commands() {}
|
||||||
|
#[test] fn divergence_analysis_runs_automatically_after_execution_completes() {}
|
||||||
|
#[test] fn divergence_report_written_to_audit_trail() {}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn integration_divergence_analysis_detects_agent_side_extra_commands() {
|
||||||
|
// Agent executes an extra `whoami` not in the runbook
|
||||||
|
// Divergence analyzer must flag it
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 5.3: Compliance Export
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/audit/export/tests.rs
|
||||||
|
|
||||||
|
#[tokio::test] async fn export_generates_valid_csv_for_date_range() {}
|
||||||
|
#[tokio::test] async fn export_generates_valid_pdf_with_execution_summary() {}
|
||||||
|
#[tokio::test] async fn export_uploads_to_s3_and_returns_presigned_url() {}
|
||||||
|
#[tokio::test] async fn export_presigned_url_expires_after_24_hours() {}
|
||||||
|
#[tokio::test] async fn export_scoped_to_tenant_via_rls() {}
|
||||||
|
#[tokio::test] async fn export_includes_hash_chain_verification_status() {}
|
||||||
|
#[tokio::test] async fn export_redacts_command_output_but_includes_hashes() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 6.4: Classification Query API Rate Limiting
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/integration/api_rate_limit_test.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn api_rate_limit_30_requests_per_minute_per_tenant() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
for i in 0..30 {
|
||||||
|
let resp = stack.api().get("/v1/run/classifications").send().await;
|
||||||
|
assert_eq!(resp.status(), 200);
|
||||||
|
}
|
||||||
|
// 31st request must be rate-limited
|
||||||
|
let resp = stack.api().get("/v1/run/classifications").send().await;
|
||||||
|
assert_eq!(resp.status(), 429);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn api_rate_limit_resets_after_60_seconds() {}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn api_rate_limit_is_per_tenant_not_global() {
|
||||||
|
// Tenant A hitting limit must not affect Tenant B
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn api_rate_limit_returns_retry_after_header() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 7: Dashboard UI (Playwright)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// tests/e2e/ui/dashboard.spec.ts
|
||||||
|
|
||||||
|
test('parse preview renders within 5 seconds of paste', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/runbooks/new');
|
||||||
|
await page.fill('[data-testid="runbook-input"]', FIXTURE_RUNBOOK);
|
||||||
|
const preview = page.locator('[data-testid="parse-preview"]');
|
||||||
|
await expect(preview).toBeVisible({ timeout: 5000 });
|
||||||
|
await expect(preview.locator('.step-card')).toHaveCount(4);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('trust level visualization shows correct colors per step', async ({ page }) => {
|
||||||
|
// 🟢 safe = green, 🟡 caution = yellow, 🔴 dangerous = red
|
||||||
|
});
|
||||||
|
|
||||||
|
test('MTTR dashboard loads and displays chart', async ({ page }) => {
|
||||||
|
await page.goto('/dashboard/analytics');
|
||||||
|
await expect(page.locator('[data-testid="mttr-chart"]')).toBeVisible();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('execution timeline shows real-time step progress', async ({ page }) => {});
|
||||||
|
test('approval modal requires typed confirmation for dangerous steps', async ({ page }) => {});
|
||||||
|
test('panic mode banner appears when panic is active', async ({ page }) => {});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Epic 9: Onboarding & PLG
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/onboarding/tests.rs
|
||||||
|
|
||||||
|
#[test] fn free_tier_allows_5_runbooks() {}
|
||||||
|
#[test] fn free_tier_allows_50_executions_per_month() {}
|
||||||
|
#[test] fn free_tier_rejects_6th_runbook_with_upgrade_prompt() {}
|
||||||
|
#[test] fn free_tier_rejects_51st_execution_with_upgrade_prompt() {}
|
||||||
|
#[test] fn free_tier_counter_resets_monthly() {}
|
||||||
|
|
||||||
|
#[test] fn agent_install_snippet_includes_correct_api_key() {}
|
||||||
|
#[test] fn agent_install_snippet_includes_correct_gateway_url() {}
|
||||||
|
#[test] fn agent_install_snippet_is_valid_bash() {}
|
||||||
|
|
||||||
|
#[tokio::test] async fn stripe_checkout_creates_session_with_correct_pricing() {}
|
||||||
|
#[tokio::test] async fn stripe_webhook_checkout_completed_upgrades_tenant() {}
|
||||||
|
#[tokio::test] async fn stripe_webhook_subscription_deleted_downgrades_tenant() {}
|
||||||
|
#[tokio::test] async fn stripe_webhook_validates_signature() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.2 Agent-Side Security Tests (Zero-Trust Environment)
|
||||||
|
|
||||||
|
The Agent runs in customer VPCs — untrusted territory. These tests prove the Agent defends itself independently of the SaaS backend.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/agent/security/tests.rs
|
||||||
|
|
||||||
|
// Agent-side deterministic blocking (mirrors SaaS scanner)
|
||||||
|
#[test] fn agent_scanner_blocks_rm_rf_independently_of_saas() {}
|
||||||
|
#[test] fn agent_scanner_blocks_kubectl_delete_namespace_independently() {}
|
||||||
|
#[test] fn agent_scanner_blocks_drop_table_independently() {}
|
||||||
|
#[test] fn agent_scanner_rejects_command_even_if_saas_says_safe() {
|
||||||
|
// Simulates compromised SaaS sending a "safe" classification for rm -rf
|
||||||
|
let saas_classification = Classification { risk: RiskLevel::Safe, .. };
|
||||||
|
let agent_result = agent_scanner.classify("rm -rf /");
|
||||||
|
assert_eq!(agent_result.risk, RiskLevel::Dangerous);
|
||||||
|
// Agent MUST override SaaS classification
|
||||||
|
}
|
||||||
|
|
||||||
|
// Binary integrity
|
||||||
|
#[test] fn agent_validates_binary_checksum_on_startup() {}
|
||||||
|
#[test] fn agent_refuses_to_start_if_checksum_mismatch() {}
|
||||||
|
|
||||||
|
// Payload tampering
|
||||||
|
#[tokio::test] async fn agent_rejects_grpc_payload_with_invalid_hmac() {}
|
||||||
|
#[tokio::test] async fn agent_rejects_grpc_payload_with_expired_timestamp() {}
|
||||||
|
#[tokio::test] async fn agent_rejects_grpc_payload_with_mismatched_execution_id() {}
|
||||||
|
|
||||||
|
// Local fallback when SaaS is unreachable
|
||||||
|
#[tokio::test] async fn agent_falls_back_to_scanner_only_when_saas_disconnected() {}
|
||||||
|
#[tokio::test] async fn agent_in_fallback_mode_treats_all_unknowns_as_caution() {}
|
||||||
|
#[tokio::test] async fn agent_reconnects_automatically_when_saas_returns() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.3 Realistic Sandbox Matrix
|
||||||
|
|
||||||
|
Replace Alpine-only sandbox with a matrix of realistic execution targets.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/integration/sandbox_matrix_test.rs
|
||||||
|
|
||||||
|
#[rstest]
|
||||||
|
#[case("ubuntu:22.04")]
|
||||||
|
#[case("amazonlinux:2023")]
|
||||||
|
#[case("alpine:3.19")]
|
||||||
|
async fn sandbox_safe_command_executes_on_all_targets(#[case] image: &str) {
|
||||||
|
let sandbox = SandboxContainer::start(image).await;
|
||||||
|
let agent = TestAgent::connect_to(sandbox.socket_path()).await;
|
||||||
|
let result = agent.execute("ls /tmp").await.unwrap();
|
||||||
|
assert_eq!(result.exit_code, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[rstest]
|
||||||
|
#[case("ubuntu:22.04")]
|
||||||
|
#[case("amazonlinux:2023")]
|
||||||
|
async fn sandbox_dangerous_command_blocked_on_all_targets(#[case] image: &str) {
|
||||||
|
let sandbox = SandboxContainer::start(image).await;
|
||||||
|
let agent = TestAgent::connect_to(sandbox.socket_path()).await;
|
||||||
|
let result = agent.execute("rm -rf /").await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-root execution
|
||||||
|
#[tokio::test]
|
||||||
|
async fn sandbox_agent_runs_as_non_root_user() {
|
||||||
|
let sandbox = SandboxContainer::start_as_user("ubuntu:22.04", "dd0c-agent").await;
|
||||||
|
let agent = TestAgent::connect_to(sandbox.socket_path()).await;
|
||||||
|
let result = agent.execute("whoami").await.unwrap();
|
||||||
|
assert_eq!(result.stdout.trim(), "dd0c-agent");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn sandbox_non_root_agent_cannot_escalate_to_root() {
|
||||||
|
let sandbox = SandboxContainer::start_as_user("ubuntu:22.04", "dd0c-agent").await;
|
||||||
|
let agent = TestAgent::connect_to(sandbox.socket_path()).await;
|
||||||
|
let result = agent.execute("sudo cat /etc/shadow").await;
|
||||||
|
assert!(result.is_err() || result.unwrap().exit_code != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// RBAC-restricted K3s
|
||||||
|
#[tokio::test]
|
||||||
|
async fn sandbox_k3s_rbac_denies_kubectl_delete_namespace() {
|
||||||
|
let k3s = K3sContainer::start_with_rbac("read-only-role").await;
|
||||||
|
let agent = TestAgent::with_kubeconfig(k3s.kubeconfig()).await;
|
||||||
|
let result = agent.execute("kubectl delete namespace default").await;
|
||||||
|
// Should be blocked by BOTH scanner AND K8s RBAC
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.4 Advanced Command Injection Tests
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/classifier/scanner/injection_tests.rs
|
||||||
|
|
||||||
|
// Semicolon injection
|
||||||
|
#[test] fn scanner_semicolon_rm_rf_is_dangerous() {
|
||||||
|
assert_dangerous("echo hello; rm -rf /");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pipe injection
|
||||||
|
#[test] fn scanner_pipe_to_rm_is_dangerous() {
|
||||||
|
assert_dangerous("find / -name '*.log' | xargs rm -rf");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Backtick injection
|
||||||
|
#[test] fn scanner_backtick_rm_is_dangerous() {
|
||||||
|
assert_dangerous("echo `rm -rf /`");
|
||||||
|
}
|
||||||
|
|
||||||
|
// $() substitution (already tested, but more variants)
|
||||||
|
#[test] fn scanner_nested_substitution_is_dangerous() {
|
||||||
|
assert_dangerous("echo $(echo $(rm -rf /))");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Newline injection
|
||||||
|
#[test] fn scanner_newline_injection_is_dangerous() {
|
||||||
|
assert_dangerous("echo safe\nrm -rf /");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Null byte injection
|
||||||
|
#[test] fn scanner_null_byte_injection_is_dangerous() {
|
||||||
|
assert_dangerous("echo safe\0rm -rf /");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unicode homoglyph attack
|
||||||
|
#[test] fn scanner_unicode_homoglyph_rm_is_dangerous() {
|
||||||
|
// Using Cyrillic 'р' and 'м' that look like 'r' and 'm'
|
||||||
|
assert_dangerous("rм -rf /"); // Should still catch this
|
||||||
|
}
|
||||||
|
|
||||||
|
// Base64 encoded payload
|
||||||
|
#[test] fn scanner_base64_decode_pipe_bash_is_dangerous() {
|
||||||
|
assert_dangerous("echo cm0gLXJmIC8= | base64 -d | bash");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Heredoc injection
|
||||||
|
#[test] fn scanner_heredoc_with_destructive_is_dangerous() {
|
||||||
|
assert_dangerous("cat << EOF | bash\nrm -rf /\nEOF");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Environment variable expansion
|
||||||
|
#[test] fn scanner_env_var_expansion_to_rm_is_dangerous() {
|
||||||
|
assert_dangerous("$CMD"); // Unknown variable expansion = unknown, not safe
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.5 Privilege Escalation Tests
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/classifier/scanner/escalation_tests.rs
|
||||||
|
|
||||||
|
#[test] fn scanner_sudo_anything_is_at_least_caution() {
|
||||||
|
assert_at_least_caution("sudo systemctl restart nginx");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_sudo_rm_is_dangerous() {
|
||||||
|
assert_dangerous("sudo rm -rf /var/log");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_su_root_is_dangerous() {
|
||||||
|
assert_dangerous("su - root -c 'rm -rf /'");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_chmod_suid_is_dangerous() {
|
||||||
|
assert_dangerous("chmod u+s /usr/bin/find");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_chown_root_is_caution() {
|
||||||
|
assert_at_least_caution("chown root:root /tmp/exploit");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_nsenter_is_dangerous() {
|
||||||
|
assert_dangerous("nsenter --target 1 --mount --uts --ipc --net --pid");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_docker_run_privileged_is_dangerous() {
|
||||||
|
assert_dangerous("docker run --privileged -v /:/host ubuntu");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn scanner_kubectl_exec_as_root_is_caution() {
|
||||||
|
assert_at_least_caution("kubectl exec -it pod -- /bin/bash");
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.6 Rollback Failure & Nested Failure Tests
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/executor/rollback/tests.rs
|
||||||
|
|
||||||
|
#[test] fn rollback_failure_transitions_to_manual_intervention() {
|
||||||
|
let mut engine = ExecutionEngine::new();
|
||||||
|
engine.transition(State::RollingBack);
|
||||||
|
engine.report_rollback_failure("rollback command timed out");
|
||||||
|
assert_eq!(engine.state(), State::ManualIntervention);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn rollback_failure_does_not_retry_automatically() {
|
||||||
|
// Rollback failures are terminal — no auto-retry
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn rollback_timeout_kills_rollback_process_after_300s() {}
|
||||||
|
|
||||||
|
#[tokio::test(start_paused = true)]
async fn rollback_hanging_indefinitely_triggers_manual_intervention_after_timeout() {
|
||||||
|
let mut engine = ExecutionEngine::with_rollback_timeout(Duration::from_secs(5));
|
||||||
|
engine.transition(State::RollingBack);
|
||||||
|
// Simulate rollback that never completes
|
||||||
|
tokio::time::advance(Duration::from_secs(6)).await;
|
||||||
|
assert_eq!(engine.state(), State::ManualIntervention);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test] fn manual_intervention_state_sends_slack_alert_to_oncall() {}
|
||||||
|
#[test] fn manual_intervention_state_logs_full_context_to_audit() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.7 Double Execution & Network Partition Tests
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/executor/idempotency/tests.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn agent_reconnect_after_partition_resyncs_already_executed_step() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
let execution = stack.start_execution().await;
|
||||||
|
|
||||||
|
// Agent executes step successfully
|
||||||
|
stack.wait_for_step_state(&execution.id, &step_id, "executing").await;
|
||||||
|
|
||||||
|
// Network partition AFTER execution but BEFORE ACK
|
||||||
|
stack.partition_agent().await;
|
||||||
|
|
||||||
|
// Agent reconnects
|
||||||
|
stack.heal_partition().await;
|
||||||
|
|
||||||
|
// Engine must recognize step was already executed — no double execution
|
||||||
|
let step = stack.get_step(&execution.id, &step_id).await;
|
||||||
|
assert_eq!(step.execution_count, 1); // Exactly once
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn engine_does_not_re_send_command_after_agent_reconnect_if_step_completed() {}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn engine_re_sends_command_if_agent_never_started_execution_before_partition() {}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.8 Slack Payload Forgery Tests
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/integration/slack_security_test.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn slack_approval_webhook_rejects_missing_signature() {
|
||||||
|
let resp = stack.api()
|
||||||
|
.post("/v1/run/slack/actions")
|
||||||
|
.json(&fixture_approval_payload())
|
||||||
|
// No X-Slack-Signature header
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 401);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn slack_approval_webhook_rejects_invalid_signature() {
|
||||||
|
let resp = stack.api()
|
||||||
|
.post("/v1/run/slack/actions")
|
||||||
|
.header("X-Slack-Signature", "v0=invalid_hmac")
|
||||||
|
.header("X-Slack-Request-Timestamp", &now_timestamp())
|
||||||
|
.json(&fixture_approval_payload())
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 401);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn slack_approval_webhook_rejects_replayed_timestamp() {
|
||||||
|
// Timestamp older than 5 minutes
|
||||||
|
let resp = stack.api()
|
||||||
|
.post("/v1/run/slack/actions")
|
||||||
|
.header("X-Slack-Signature", &valid_signature_for_old_timestamp())
|
||||||
|
.header("X-Slack-Request-Timestamp", &five_minutes_ago())
|
||||||
|
.json(&fixture_approval_payload())
|
||||||
|
.send().await;
|
||||||
|
assert_eq!(resp.status(), 401);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn slack_approval_webhook_rejects_cross_tenant_approval() {
|
||||||
|
// Tenant A's user trying to approve Tenant B's execution
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.9 Audit Log Encryption Tests
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// tests/integration/audit_encryption_test.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn audit_log_command_field_is_encrypted_at_rest() {
|
||||||
|
let db = TestDb::start().await;
|
||||||
|
// Insert an audit event with a command
|
||||||
|
insert_audit_event(&db, "kubectl get pods").await;
|
||||||
|
|
||||||
|
// Read raw bytes from PostgreSQL — must NOT contain plaintext command
|
||||||
|
let raw = db.query_raw_bytes("SELECT command FROM audit_events LIMIT 1").await;
|
||||||
|
assert!(!String::from_utf8_lossy(&raw).contains("kubectl get pods"),
|
||||||
|
"Command stored in plaintext — must be encrypted");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn audit_log_output_field_is_encrypted_at_rest() {
|
||||||
|
let db = TestDb::start().await;
|
||||||
|
insert_audit_event_with_output(&db, "sensitive output data").await;
|
||||||
|
|
||||||
|
let raw = db.query_raw_bytes("SELECT output FROM audit_events LIMIT 1").await;
|
||||||
|
assert!(!String::from_utf8_lossy(&raw).contains("sensitive output data"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn audit_log_decryption_requires_kms_key() {
|
||||||
|
// Verify the app role can decrypt using the KMS key
|
||||||
|
let db = TestDb::start().await;
|
||||||
|
insert_audit_event(&db, "kubectl get pods").await;
|
||||||
|
|
||||||
|
let decrypted = db.as_app_role()
|
||||||
|
.query("SELECT decrypt_command(command) FROM audit_events LIMIT 1").await;
|
||||||
|
assert_eq!(decrypted, "kubectl get pods");
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.10 gRPC Output Buffer Limits
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// pkg/agent/streaming/tests.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn agent_truncates_stdout_at_10mb() {
|
||||||
|
let sandbox = SandboxContainer::start("ubuntu:22.04").await;
|
||||||
|
let agent = TestAgent::connect_to(sandbox.socket_path()).await;
|
||||||
|
|
||||||
|
// Generate 50MB of output
|
||||||
|
let result = agent.execute("dd if=/dev/urandom bs=1M count=50 | base64").await.unwrap();
|
||||||
|
|
||||||
|
// Agent must truncate, not OOM
|
||||||
|
assert!(result.stdout.len() <= 10 * 1024 * 1024);
|
||||||
|
assert!(result.truncated);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn agent_streams_output_in_chunks_not_buffered() {
|
||||||
|
// Verify output arrives incrementally, not all at once after completion
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn agent_memory_stays_under_256mb_during_large_output() {
|
||||||
|
// Memory profiling test — agent must not OOM on `cat /dev/urandom`
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn engine_handles_truncated_output_gracefully() {
|
||||||
|
// Engine receives truncated flag and logs warning
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.11 Parse SLA End-to-End Benchmark
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// benches/parse_sla_bench.rs
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn parse_plus_classify_pipeline_under_5s_p95() {
|
||||||
|
let stack = E2EStack::start().await;
|
||||||
|
let mut latencies = vec![];
|
||||||
|
|
||||||
|
for _ in 0..100 {
|
||||||
|
let start = Instant::now();
|
||||||
|
stack.api()
|
||||||
|
.post("/v1/run/runbooks/parse-preview")
|
||||||
|
.json(&json!({ "raw_text": FIXTURE_RUNBOOK_10_STEPS }))
|
||||||
|
.send().await;
|
||||||
|
latencies.push(start.elapsed());
|
||||||
|
}
|
||||||
|
|
||||||
|
let p95 = percentile(&latencies, 95);
|
||||||
|
assert!(p95 < Duration::from_secs(5),
|
||||||
|
"Parse+Classify p95 latency: {:?} — exceeds 5s SLA", p95);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.12 Updated Test Pyramid (Post-Review)
|
||||||
|
|
||||||
|
The Execution Engine ratio shifts from 80/15/5 to 60/30/10 per review recommendation:
|
||||||
|
|
||||||
|
| Component | Unit | Integration | E2E |
|
||||||
|
|-----------|------|-------------|-----|
|
||||||
|
| Safety Scanner | 80% | 15% | 5% |
|
||||||
|
| Merge Engine | 90% | 10% | 0% |
|
||||||
|
| Execution Engine | **60%** | **30%** | **10%** |
|
||||||
|
| Parser | 50% | 40% | 10% |
|
||||||
|
| Approval Workflow | 70% | 20% | 10% |
|
||||||
|
| Audit Trail | 60% | 35% | 5% |
|
||||||
|
| Agent | 50% | 35% | 15% |
|
||||||
|
| Dashboard API | 40% | 50% | 10% |
|
||||||
|
|
||||||
|
*End of Review Remediation Addendum*
|
||||||
|
|||||||
226
products/plg-instrumentation-brainstorm.md
Normal file
226
products/plg-instrumentation-brainstorm.md
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
# dd0c Platform — PLG Instrumentation Brainstorm
|
||||||
|
|
||||||
|
**Session:** Carson (Brainstorming Coach) — Cross-Product PLG Analytics
|
||||||
|
**Date:** March 1, 2026
|
||||||
|
**Scope:** All 6 dd0c products
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## The Problem
|
||||||
|
|
||||||
|
We built 6 products with onboarding flows, free tiers, and Stripe billing — but zero product analytics. We can't answer:
|
||||||
|
|
||||||
|
- How many users hit "aha moment" vs. bounce?
|
||||||
|
- Where in the funnel do free users drop off before upgrading?
|
||||||
|
- Which features drive retention vs. which are ignored?
|
||||||
|
- Are users churning because of alert fatigue, false positives, or just not getting value?
|
||||||
|
- What's our time-to-first-value per product?
|
||||||
|
|
||||||
|
Without instrumentation, PLG iteration is guesswork.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Brainstorm: What to Instrument
|
||||||
|
|
||||||
|
### 1. Unified Event Taxonomy
|
||||||
|
|
||||||
|
Every dd0c product shares a common event naming convention:
|
||||||
|
|
||||||
|
```
|
||||||
|
<domain>.<object>.<action>
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
account.signup.completed
|
||||||
|
account.aws.connected
|
||||||
|
anomaly.alert.sent
|
||||||
|
anomaly.alert.snoozed
|
||||||
|
slack.bot.installed
|
||||||
|
billing.checkout.started
|
||||||
|
billing.upgrade.completed
|
||||||
|
feature.flag.evaluated
|
||||||
|
```
|
||||||
|
|
||||||
|
**Rules:**
|
||||||
|
- Past tense for completed actions (`completed`, `sent`, `clicked`)
|
||||||
|
- Present tense for state changes (`active`, `learning`, `paused`)
|
||||||
|
- Always include `tenant_id`, `timestamp`, `product` (route/drift/alert/portal/cost/run)
|
||||||
|
- Never include PII — hash emails, account IDs
|
||||||
|
|
||||||
|
### 2. Per-Product Activation Metrics
|
||||||
|
|
||||||
|
The "aha moment" is different for each product:
|
||||||
|
|
||||||
|
| Product | Aha Moment | Metric | Target |
|
||||||
|
|---------|-----------|--------|--------|
|
||||||
|
| dd0c/route | First dollar saved by model routing | `routing.savings.first_dollar` | <24hr from signup |
|
||||||
|
| dd0c/drift | First drift detected in real stack | `drift.detection.first_found` | <1hr from agent install |
|
||||||
|
| dd0c/alert | First alert correlated (not just forwarded) | `alert.correlation.first_match` | <60sec from first alert |
|
||||||
|
| dd0c/portal | First service auto-discovered | `portal.discovery.first_service` | <5min from install |
|
||||||
|
| dd0c/cost | First anomaly detected in real account | `cost.anomaly.first_detected` | <24hr from AWS connect |
|
||||||
|
| dd0c/run | First runbook executed successfully | `run.execution.first_success` | <10min from setup |
|
||||||
|
|
||||||
|
### 3. Conversion Funnel (Universal)
|
||||||
|
|
||||||
|
Every product shares this funnel shape:
|
||||||
|
|
||||||
|
```
|
||||||
|
Signup → Connect (AWS/Slack/Git) → First Value → Habit → Upgrade
|
||||||
|
```
|
||||||
|
|
||||||
|
Events per stage:
|
||||||
|
|
||||||
|
**Stage 1: Signup**
|
||||||
|
- `account.signup.started` — landed on signup page
|
||||||
|
- `account.signup.completed` — account created
|
||||||
|
- `method` property on `account.signup.completed` — github_sso / google_sso / email (a property, not a separate event, per the `<domain>.<object>.<action>` rule)
|
||||||
|
|
||||||
|
**Stage 2: Connect**
|
||||||
|
- `account.integration.started` — began connecting external service
|
||||||
|
- `account.integration.completed` — connection verified
|
||||||
|
- `account.integration.failed` — connection failed (include `error_type`)
|
||||||
|
- Product-specific: `account.aws.connected`, `account.slack.installed`, `account.git.connected`
|
||||||
|
|
||||||
|
**Stage 3: First Value**
|
||||||
|
- Product-specific aha moment event (see table above)
|
||||||
|
- `onboarding.wizard.step_completed` — which step, how long
|
||||||
|
- `onboarding.wizard.abandoned` — which step they quit on
|
||||||
|
|
||||||
|
**Stage 4: Habit**
|
||||||
|
- `session.daily.active` — DAU ping
|
||||||
|
- `session.weekly.active` — WAU ping
|
||||||
|
- `feature.<name>.used` — per-feature usage
|
||||||
|
- `notification.digest.opened` — are they reading digests?
|
||||||
|
- `slack.command.used` — which slash commands, how often
|
||||||
|
|
||||||
|
**Stage 5: Upgrade**
|
||||||
|
- `billing.checkout.started`
|
||||||
|
- `billing.checkout.completed`
|
||||||
|
- `billing.checkout.abandoned`
|
||||||
|
- `billing.plan.changed` — upgrade/downgrade
|
||||||
|
- `billing.churn.detected` — subscription cancelled
|
||||||
|
|
||||||
|
### 4. Feature Usage Events (Per Product)
|
||||||
|
|
||||||
|
**dd0c/route (LLM Cost Router)**
|
||||||
|
- `routing.request.processed` — model selected, latency, cost
|
||||||
|
- `routing.override.manual` — user forced a specific model
|
||||||
|
- `routing.savings.calculated` — weekly savings digest generated
|
||||||
|
- `routing.shadow.audit.run` — shadow mode comparison completed
|
||||||
|
- `dashboard.cost.viewed` — opened cost dashboard
|
||||||
|
|
||||||
|
**dd0c/drift (IaC Drift Detection)**
|
||||||
|
- `drift.scan.completed` — scan finished, drifts found count
|
||||||
|
- `drift.remediation.clicked` — user clicked "fix drift"
|
||||||
|
- `drift.remediation.applied` — drift actually fixed
|
||||||
|
- `drift.false_positive.marked` — user dismissed a drift
|
||||||
|
- `drift.agent.heartbeat` — agent is alive and scanning
|
||||||
|
|
||||||
|
**dd0c/alert (Alert Intelligence)**
|
||||||
|
- `alert.ingested` — raw alert received
|
||||||
|
- `alert.correlated` — alerts grouped into incident
|
||||||
|
- `alert.suppressed` — duplicate/noise suppressed
|
||||||
|
- `alert.escalated` — sent to on-call
|
||||||
|
- `alert.feedback.helpful` / `alert.feedback.noise` — user feedback
|
||||||
|
- `alert.mttr.measured` — time from alert to resolution
|
||||||
|
|
||||||
|
**dd0c/portal (Lightweight IDP)**
|
||||||
|
- `portal.service.discovered` — auto-discovery found a service
|
||||||
|
- `portal.service.claimed` — team claimed ownership
|
||||||
|
- `portal.scorecard.viewed` — someone checked service health
|
||||||
|
- `portal.scorecard.action_taken` — acted on a recommendation
|
||||||
|
- `portal.search.performed` — searched the catalog
|
||||||
|
|
||||||
|
**dd0c/cost (AWS Cost Anomaly)**
|
||||||
|
- `cost.event.ingested` — CloudTrail event processed
|
||||||
|
- `cost.anomaly.scored` — anomaly scoring completed
|
||||||
|
- `cost.anomaly.alerted` — Slack alert sent
|
||||||
|
- `cost.anomaly.snoozed` — user snoozed alert
|
||||||
|
- `cost.anomaly.expected` — user marked as expected
|
||||||
|
- `cost.remediation.clicked` — user clicked Stop/Terminate
|
||||||
|
- `cost.remediation.executed` — remediation completed
|
||||||
|
- `cost.zombie.detected` — idle resource found
|
||||||
|
- `cost.digest.sent` — daily digest delivered
|
||||||
|
|
||||||
|
**dd0c/run (Runbook Automation)**
|
||||||
|
- `run.runbook.created` — new runbook authored
|
||||||
|
- `run.execution.started` — runbook execution began
|
||||||
|
- `run.execution.completed` — execution finished (include `success`/`failed`)
|
||||||
|
- `run.execution.approval_requested` — human approval needed
|
||||||
|
- `run.execution.approval_granted` — human approved
|
||||||
|
- `run.execution.rolled_back` — rollback triggered
|
||||||
|
- `run.sandbox.test.run` — dry-run in sandbox
|
||||||
|
|
||||||
|
### 5. Health Scoring (Churn Prediction)
|
||||||
|
|
||||||
|
Composite health score per tenant, updated daily:
|
||||||
|
|
||||||
|
```
|
||||||
|
health_score = (
|
||||||
|
0.3 * activation_complete + // did they hit aha moment?
|
||||||
|
0.2 * weekly_active_days + // how many days active this week?
|
||||||
|
0.2 * feature_breadth + // how many features used?
|
||||||
|
0.15 * integration_depth + // how many integrations connected?
|
||||||
|
0.15 * feedback_sentiment // positive vs negative actions
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Thresholds:
|
||||||
|
- `health > 0.7` → Healthy (green)
|
||||||
|
- `health 0.4-0.7` → At Risk (yellow) → trigger re-engagement email
|
||||||
|
- `health < 0.4` → Churning (red) → trigger founder outreach
|
||||||
|
|
||||||
|
### 6. Analytics Stack Recommendation
|
||||||
|
|
||||||
|
**PostHog** (self-hosted on AWS):
|
||||||
|
- Open source, self-hostable → no vendor lock-in
|
||||||
|
- Free tier: unlimited events self-hosted
|
||||||
|
- Built-in: funnels, retention, feature flags, session replay
|
||||||
|
- Supports custom events via REST API or JS/Python SDK
|
||||||
|
- Can run on a single t3.medium for V1 traffic
|
||||||
|
|
||||||
|
**Why not Segment/Amplitude/Mixpanel:**
|
||||||
|
- Segment: $120/mo minimum, overkill for solo founder
|
||||||
|
- Amplitude: free tier is generous but cloud-only, data leaves your infra
|
||||||
|
- Mixpanel: same cloud-only concern
|
||||||
|
- PostHog self-hosted: $0/mo, data stays in your AWS account, GDPR-friendly
|
||||||
|
|
||||||
|
**Integration pattern:**
|
||||||
|
```
|
||||||
|
Lambda/API → PostHog REST API (async, fire-and-forget)
|
||||||
|
Next.js UI → PostHog JS SDK (auto-captures pageviews, clicks)
|
||||||
|
Slack Bot → PostHog Python SDK (command usage, action clicks)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Cross-Product Flywheel Metrics
|
||||||
|
|
||||||
|
dd0c is a platform — users on one product should discover others:
|
||||||
|
|
||||||
|
- `platform.cross_sell.impression` — "Try dd0c/alert" banner shown
|
||||||
|
- `platform.cross_sell.clicked` — user clicked cross-sell
|
||||||
|
- `platform.cross_sell.activated` — user activated second product
|
||||||
|
- `platform.products.active_count` — how many dd0c products per tenant
|
||||||
|
|
||||||
|
**Flywheel hypothesis:** Users who activate 2+ dd0c products have 3x lower churn than single-product users. We need data to prove/disprove this.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Epic 11 Proposal: PLG Instrumentation
|
||||||
|
|
||||||
|
### Scope
|
||||||
|
Cross-cutting epic added to all 6 products. Shared analytics SDK, per-product event implementations, funnel dashboards, health scoring.
|
||||||
|
|
||||||
|
### Stories (Draft)
|
||||||
|
1. **PostHog Infrastructure** — CDK stack for self-hosted PostHog on ECS Fargate
|
||||||
|
2. **Analytics SDK** — Shared TypeScript/Python wrapper with standard event schema
|
||||||
|
3. **Funnel Dashboard** — PostHog dashboard template per product
|
||||||
|
4. **Activation Tracking** — Per-product aha moment detection and logging
|
||||||
|
5. **Health Scoring Engine** — Daily cron that computes tenant health scores
|
||||||
|
6. **Cross-Sell Instrumentation** — Platform-level cross-product discovery events
|
||||||
|
7. **Churn Alert Pipeline** — Health score → Slack alert to founder when tenant goes red
|
||||||
|
|
||||||
|
### Estimate
|
||||||
|
~25 story points across all products (shared infrastructure + per-product event wiring)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*This brainstorm establishes the "what" and "why." Party Mode advisory board should stress-test: Is PostHog the right choice? Is the event taxonomy too granular? Should health scoring be V1 or V2? Is 25 points realistic?*
|
||||||
Reference in New Issue
Block a user