feat: confluence benchmark, pattern extractor, agent KB, UX spec

- extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code
- agent-kb.js: token-efficient JSON rendering of same doc tree
- eval-confluence-ref-questions.json: 32 reference-only benchmark questions
- wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%)
- docs/human-ux-spec.md: BMad UX designer spec for human doc structure
- Eval results: V2 at 28.7% vs confluence 77.8% baseline
- Hub/spoke ownership now correctly extracted (95% on that question)
- Naming conventions, regions, CIDRs surfaced in system-architecture.md
2026-03-10 14:20:35 +00:00
parent 049609a358
commit 0265ec7a60
844 changed files with 2129910 additions and 30 deletions
--- a/promptfoo.yaml
+++ b/promptfoo.yaml
@@ -0,0 +1,136 @@
+description: "Dev Intel V3 - Documentation Quality Eval"
+
+# Model under test: an OpenAI-compatible chat endpoint addressed via apiBaseUrl.
+providers:
+  - id: openai:chat:claude-haiku-4.5
+    config:
+      apiBaseUrl: http://192.168.86.11:8000/v1
+      # Do not commit credentials. The key is read at run time from the
+      # DEV_INTEL_API_KEY environment variable; the key that was previously
+      # stored in this file should be treated as leaked and rotated.
+      apiKeyEnvar: DEV_INTEL_API_KEY
+      # Temperature 0 to minimise run-to-run variation between eval runs.
+      temperature: 0
+
+# Single prompt template. {{docsDir}} and {{question}} are placeholders whose
+# names match the `vars` keys supplied by each entry under `tests`.
+# NOTE(review): the template only passes the docs directory *path* to the
+# model; confirm the provider can actually read files at that path, otherwise
+# answers cannot be based on the documentation contents.
+prompts:
+  - |
+    You are evaluating auto-generated infrastructure documentation.
+    Read the docs directory at {{docsDir}} and answer this question:
+
+    {{question}}
+
+    Be specific and cite file paths where possible.
+
+# Settings shared by every test case below; an individual test can still
+# override them with its own `vars` or `options`.
+defaultTest:
+  vars:
+    # Documentation tree that every question is asked about.
+    docsDir: ./foxtrot-docs-v3
+  options:
+    # Grader used by all llm-rubric assertions. The endpoint settings are
+    # nested under `provider.config`: a `config` key placed next to `provider`
+    # on the assertion itself is not the grader's provider configuration, so
+    # the custom base URL and key would not reach the grader.
+    provider:
+      id: openai:chat:claude-haiku-4.5
+      config:
+        apiBaseUrl: http://192.168.86.11:8000/v1
+        # Do not commit credentials. The key is read at run time from the
+        # DEV_INTEL_API_KEY environment variable; the key that was previously
+        # stored in this file should be treated as leaked and rotated.
+        apiKeyEnvar: DEV_INTEL_API_KEY
+
+# One entry per benchmark question. Each supplies the `question` var for the
+# prompt and a rubric the grader scores the answer against.
+tests:
+  # Structural
+  - vars:
+      question: "How many subsystems does the Foxtrot monorepo contain? List them."
+    assert:
+      - type: llm-rubric
+        value: "Answer should list 12 subsystems including account-common, app-common, app-tools, compute-common, compute-tools, control-core, ipam-core, ipam-tools, network-common, network-core, runtime, and root"
+
+  - vars:
+      question: "Which 5 Helm charts produce the most Kubernetes resources?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should identify specific charts and their resource counts from the documentation"
+
+  # Dependencies
+  - vars:
+      question: "What are the dependencies of the external-dns chart?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should list the sub-chart dependencies of external-dns from the Helm index"
+
+  - vars:
+      question: "What are the dependencies of the ingress-nginx chart?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should list the sub-chart dependencies of ingress-nginx"
+
+  # Configuration
+  - vars:
+      question: "How many Helm charts define an mdm-app deployment?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should provide a count of charts with mdm-app resources"
+
+  # Terraform
+  - vars:
+      question: "How many Terraform module directories are documented? List the top 5."
+    assert:
+      - type: llm-rubric
+        value: "Answer should reference the terraform index and list specific module directories from control-core or other subsystems"
+
+  # Architecture
+  - vars:
+      question: "Which subsystem has the most files and why?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should identify runtime as having the most files (~16K) and explain it contains rendered manifests or deployment artifacts"
+
+  # Cross-subsystem
+  - vars:
+      question: "Which subsystems have zero functions and what does that indicate architecturally?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should identify account-common, network-common, network-core, control-core as zero-function subsystems and explain they are declarative/IaC configuration subsystems"
+
+  # Impact
+  - vars:
+      question: "What is the blast radius of modifying a Terraform module in control-core?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should reference the change impact analysis and describe downstream dependents"
+
+  # Entry Points
+  - vars:
+      question: "What entry points were detected in the codebase?"
+    assert:
+      - type: llm-rubric
+        value: "Answer should reference detected entry points like Helm workloads, Python main, shell main, or CI pipelines"