Files
dev-intel-v2/eval-agent-report-v5.json
Jarvis Prime ca11b4459a Agent eval hits 93.4% — target exceeded
- Fixed ground truth generator to merge Helm entities (matching sysdoc.js pipeline)
- Added Quick Lookup index with name-to-file mapping for agent navigation
- Enriched All Charts table with AppVersion, Dependencies, Values Keys columns
- Increased agent file read cap to 30K for full index coverage
- Tree depth 4 for chart file discovery

Score progression: 54.3% → 84.3% → 88.4% → 93.4%
NOT_FOUND: 41% → 0%
All categories at or above 75% (lowest: interaction at 75.0%), easy questions at 98.1%
2026-03-10 00:40:38 +00:00

721 lines
35 KiB
JSON

{
"evalType": "agent",
"timestamp": "2026-03-10T00:40:22.364Z",
"docsDir": "./foxtrot-docs",
"totalQuestions": 22,
"overallScore": "93.4",
"avgAccuracy": "4.82",
"avgCompleteness": "4.68",
"avgPrecision": "4.86",
"avgNavigation": "4.32",
"notFoundCount": 0,
"notFoundRate": "0.0%",
"byCategory": {
"structural": {
"avg": "93.3",
"count": 3
},
"dependency": {
"avg": "93.0",
"count": 5
},
"contract": {
"avg": "80.0",
"count": 1
},
"configuration": {
"avg": "98.0",
"count": 5
},
"interaction": {
"avg": "75.0",
"count": 2
},
"resource": {
"avg": "97.5",
"count": 4
},
"cross-subsystem": {
"avg": "100.0",
"count": 1
},
"architectural": {
"avg": "100.0",
"count": 1
}
},
"byDifficulty": {
"easy": {
"avg": "98.1",
"count": 8
},
"medium": {
"avg": "92.9",
"count": 12
},
"hard": {
"avg": "77.5",
"count": 2
}
},
"results": [
{
"id": "struct-001",
"category": "structural",
"difficulty": "easy",
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
"groundTruth": "76",
"llmAnswer": "76",
"filesRead": [
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/subsystems/runtime.md",
"reference/system-architecture.md",
"reference/contracts/index.md"
],
"plannedFiles": [
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/subsystems/runtime.md",
"reference/system-architecture.md",
"reference/contracts/index.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo."
},
"composite": 100,
"notFound": false
},
{
"id": "struct-002",
"category": "structural",
"difficulty": "medium",
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
"llmAnswer": "1. cluster (app-common/apps-provider/addons/charts/common/cluster) - 55 resources\n2. cluster (app-common/charts/cluster) - 32 resources\n3. jenkins - 29 resources\n4. kube-state-metrics - 27 resources\n5. foxtrot-eks - 24 resources",
"filesRead": [
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/system-architecture.md",
"reference/subsystems/app-common.md",
"reference/subsystems/runtime.md"
],
"plannedFiles": [
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/system-architecture.md",
"reference/subsystems/app-common.md",
"reference/subsystems/runtime.md"
],
"score": {
"accuracy": 4,
"completeness": 4,
"precision": 5,
"navigation": 4,
"notes": "All 5 items identified with correct resource counts in proper ranking order. However, items 3-5 (jenkins, kube-state-metrics, foxtrot-eks) are missing their full chart paths from the ground truth. Item 1 and 2 include full paths correctly, but the inconsistency and omission of paths for the remaining items indicates incomplete information capture despite accurate counts."
},
"composite": 85,
"notFound": false
},
{
"id": "struct-003",
"category": "structural",
"difficulty": "easy",
"question": "How many subsystems does the Foxtrot codebase contain?",
"groundTruth": "12",
"llmAnswer": "\n\n12",
"filesRead": [
"reference/system-architecture.md",
"reference/subsystems/root.md",
"reference/subsystems/app-common.md",
"reference/subsystems/control-core.md",
"reference/subsystems/runtime.md"
],
"plannedFiles": [
"reference/system-architecture.md",
"reference/subsystems/root.md",
"reference/subsystems/app-common.md",
"reference/subsystems/control-core.md",
"reference/subsystems/runtime.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "AI provided exact correct answer matching ground truth. Answer is concise with no hallucinations or extraneous information. Navigation scored 4 rather than 5 due to lack of visible process documentation showing direct path to source."
},
"composite": 95,
"notFound": false
},
{
"id": "dep-external-dns-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
"llmAnswer": "Based on the documentation provided, the dependencies of the \"external-dns\" chart at `app-common/apps-provider/addons/charts/common/external-dns` are:\n\n- **external-dns** (1.17.0) *if external-dns.enabled*",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"diagrams/app-common-contracts.mmd"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"diagrams/app-common-contracts.mmd"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "AI correctly identified the single dependency (external-dns 1.17.0) with the exact condition (external-dns.enabled). No hallucinations or extraneous information. Formatting differs slightly from ground truth (asterisks vs brackets for condition) but factual content is identical. Navigation score reduced from 5 only because the response doesn't explicitly show the path taken to locate the information, though the accuracy indicates successful file discovery."
},
"composite": 95,
"notFound": false
},
{
"id": "dep-ingress-nginx-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
"llmAnswer": "\n\nBased on the documentation file for the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx`, the dependencies are:\n\n- ingress-nginx (4.12.0) *if ingress-nginx.enabled*",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"diagrams/app-common-contracts.mmd"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"diagrams/app-common-contracts.mmd"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI answer correctly identifies the single dependency (ingress-nginx 4.12.0) with its conditional requirement. Format differs slightly from ground truth notation but conveys identical information. No hallucinations or missing details."
},
"composite": 100,
"notFound": false
},
{
"id": "dep-kube-state-metrics-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
"llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"kube-state-metrics\" chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` are:\n\n- kube-state-metrics (5.26.0) - conditional on `kube-state-metrics.enabled`",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"reference/system-architecture.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"reference/system-architecture.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "AI answer correctly identifies the single dependency (kube-state-metrics 5.26.0) with the proper condition flag. Formatting differs slightly from ground truth (prose vs. bracket notation) but conveys identical information. No hallucinations or extraneous details. Navigation score reduced from 5 only because actual file discovery process isn't visible, though correctness strongly indicates proper source location."
},
"composite": 95,
"notFound": false
},
{
"id": "dep-open-telemetry-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
"llmAnswer": "Based on the documentation for the \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector`, the dependencies are:\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"diagrams/app-common-contracts.mmd"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"diagrams/app-common-contracts.mmd"
],
"score": {
"accuracy": 5,
"completeness": 4,
"precision": 5,
"navigation": 4,
"notes": "AI correctly identified all 7 dependencies with accurate names and versions. However, it omitted the conditional metadata ([condition: ...]) that appears in the ground truth. The core dependency information is complete and accurate, but the answer lacks the full context of when each dependency is enabled."
},
"composite": 90,
"notFound": false
},
{
"id": "dep-reltio-prometheus-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
"llmAnswer": "Based on the documentation provided, the dependencies of the \"reltio-prometheus\" chart are:\n\n- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"reference/system-architecture.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/contracts/index.md",
"reference/system-architecture.md"
],
"score": {
"accuracy": 5,
"completeness": 3,
"precision": 5,
"navigation": 4,
"notes": "AI correctly identified both dependencies with accurate versions but omitted the conditional metadata ([condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled]) that are part of the ground truth specification. No hallucinations or incorrect information present. Answer suggests successful file location but doesn't demonstrate navigation path."
},
"composite": 85,
"notFound": false
},
{
"id": "contract-001",
"category": "contract",
"difficulty": "medium",
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
"llmAnswer": "\n\nBased on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - app-common/apps-provider/addons/charts/common/cluster\n - app-common/charts/cluster\n\n2. **logdna-agent-tags**\n - app-common/apps-provider/addons/charts/common/cluster\n - app-common/charts/cluster\n\n3. **vault-secret**\n - app-common/charts/ai-app\n - app-common/charts/cleanse-app\n - app-common/charts/pms-app\n - app-common/charts/rdm-app\n - app-common/charts/shared-app\n\n4. **vault-secret-rdm-tasks**\n - app-common/charts/ai-app\n - app-common/charts/cleanse-app\n - app-common/charts/pms-app\n - app-common/charts/rdm-app\n - app-common/charts/shared-app",
"filesRead": [
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"reference/system-architecture.md",
"diagrams/helm-interactions.mmd"
],
"plannedFiles": [
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"reference/system-architecture.md",
"diagrams/helm-interactions.mmd"
],
"score": {
"accuracy": 4,
"completeness": 5,
"precision": 3,
"navigation": 4,
"notes": "AI correctly identified all four secrets and their chart associations matching ground truth facts. However, it presented answers using full file paths (e.g., 'app-common/charts/ai-app') instead of the simplified chart names expected ('ai-app'). This format deviation reduces precision due to unnecessary extra information, though the core facts are accurate. Navigation was sound—agent found the helm-interactions diagram and extracted correct data."
},
"composite": 80,
"notFound": false
},
{
"id": "config-mdm-app-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
"groundTruth": "38",
"llmAnswer": "38",
"filesRead": [
"reference/helm/charts/app-common-charts-mdm-app.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-charts-mdm-app.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file."
},
"composite": 100,
"notFound": false
},
{
"id": "config-ai-app-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
"groundTruth": "34",
"llmAnswer": "34",
"filesRead": [
"reference/helm/charts/app-common-charts-ai-app.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-charts-ai-app.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to locate and analyze the values.yaml file."
},
"composite": 95,
"notFound": false
},
{
"id": "config-elasticsearch-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
"groundTruth": "32",
"llmAnswer": "32",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI answer matches ground truth exactly. The agent correctly identified 32 top-level configuration keys in the elasticsearch chart's values.yaml file. No extraneous information or hallucination present. Perfect alignment with expected answer."
},
"composite": 100,
"notFound": false
},
{
"id": "config-hazelcast-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
"groundTruth": "32",
"llmAnswer": "32",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly lower because the response doesn't show the agent's process of locating and examining the values.yaml file, though the correct answer strongly implies successful file discovery."
},
"composite": 95,
"notFound": false
},
{
"id": "config-redis-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
"groundTruth": "23",
"llmAnswer": "23",
"filesRead": [
"reference/helm/charts/app-common-charts-redis.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-charts-redis.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI answer matches ground truth exactly (23). The precise numerical answer with no extraneous information indicates successful navigation to the correct file and accurate counting of top-level keys in values.yaml."
},
"composite": 100,
"notFound": false
},
{
"id": "interaction-001",
"category": "interaction",
"difficulty": "medium",
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
"llmAnswer": "Based on the interaction diagram and contracts in the provided documentation, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → kubernetes service\n2. **hazelcast** (`app-common/charts/hazelcast`) → kubernetes service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → argo-rollouts-metrics service\n4. **redis** (`app-common/charts/redis`) → headless service",
"filesRead": [
"reference/helm/index.md",
"reference/contracts/index.md",
"diagrams/helm-interactions.mmd",
"reference/system-architecture.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/index.md",
"reference/contracts/index.md",
"diagrams/helm-interactions.mmd",
"reference/system-architecture.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "Answer matches ground truth exactly on all four items with correct paths and service references. The addition of 'service' after each service name is clarifying, not erroneous. No hallucinations or missing items. Navigation score slightly lower only because the interaction process isn't visible, but the specificity and correctness of paths suggests effective information retrieval."
},
"composite": 95,
"notFound": false
},
{
"id": "interaction-002",
"category": "interaction",
"difficulty": "hard",
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
"llmAnswer": "Based on the Helm charts documentation provided, the following network ports are used by multiple charts:\n\n- **port:443** - cluster, twistlock-defender, eck-operator\n- **port:9200** - eck-operator, elasticsearch",
"filesRead": [
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/system-architecture.md",
"diagrams/helm-interactions.mmd",
"reference/subsystems/network-common.md"
],
"plannedFiles": [
"reference/helm/index.md",
"reference/subsystems/app-common.md",
"reference/system-architecture.md",
"diagrams/helm-interactions.mmd",
"reference/subsystems/network-common.md"
],
"score": {
"accuracy": 3,
"completeness": 2,
"precision": 4,
"navigation": 2,
"notes": "AI found ports 443 and 9200 with mostly correct chart associations, but missed 'twistlock' for port 443. Critical gaps: completely omitted ports 8889 and 13133, representing 50% data loss. No hallucinated information, but navigation failed to locate all relevant Helm chart definitions."
},
"composite": 55,
"notFound": false
},
{
"id": "resource-001",
"category": "resource",
"difficulty": "medium",
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
"llmAnswer": "Based on the system architecture documentation, the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts are:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25",
"filesRead": [
"reference/helm/index.md",
"reference/contracts/index.md",
"diagrams/all-contracts.mmd",
"reference/system-architecture.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/index.md",
"reference/contracts/index.md",
"diagrams/all-contracts.mmd",
"reference/system-architecture.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI answer matches ground truth exactly across all 10 resource types with correct counts and ranking. No hallucinations or extraneous information present. Perfect alignment indicates successful information retrieval."
},
"composite": 100,
"notFound": false
},
{
"id": "resource-mdm-app-001",
"category": "resource",
"difficulty": "medium",
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
"llmAnswer": "Based on the mdm-app chart documentation, here are the Kubernetes resource types generated:\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- Service: 2\n- ServiceAccount: 1\n\n**Total: 22 resources**",
"filesRead": [
"reference/helm/charts/app-common-charts-mdm-app.md",
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"diagrams/app-common-contracts.mmd"
],
"plannedFiles": [
"reference/helm/charts/app-common-charts-mdm-app.md",
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"diagrams/app-common-contracts.mmd"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "AI answer matches ground truth exactly: all 11 resource types listed with correct counts totaling 22 resources. No hallucinations or omissions. Navigation scored 4 rather than 5 because the evaluation cannot verify the actual path taken to locate this specific mdm-app chart information, though perfect accuracy suggests effective information retrieval."
},
"composite": 95,
"notFound": false
},
{
"id": "resource-cassandra-001",
"category": "resource",
"difficulty": "medium",
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
"llmAnswer": "\n\nBased on the cassandra chart documentation, here are the Kubernetes resource types generated:\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1\n\n**Total: 14 resources across 13 resource types**",
"filesRead": [
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"diagrams/all-contracts.mmd"
],
"plannedFiles": [
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"diagrams/all-contracts.mmd"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 4,
"notes": "Answer matches ground truth exactly with all 13 resource types and correct counts. No hallucinations or extraneous information. Perfect factual accuracy and completeness. Navigation score slightly lower because the response doesn't explicitly demonstrate how the information was located, though the perfect accuracy indicates successful information retrieval."
},
"composite": 95,
"notFound": false
},
{
"id": "resource-jenkins-001",
"category": "resource",
"difficulty": "medium",
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
"llmAnswer": "Based on the Jenkins chart documentation, here are the Kubernetes resource types generated:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- PersistentVolumeClaim: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- ServiceMonitor: 1\n- PodMonitoring: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- Route: 1\n- SecurityGroupPolicy: 1\n- SecretClaim: 1\n- Pod: 1",
"filesRead": [
"reference/helm/charts/app-common-charts-jenkins.md",
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"reference/system-architecture.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-charts-jenkins.md",
"reference/helm/index.md",
"reference/contracts/index.md",
"reference/subsystems/app-common.md",
"reference/system-architecture.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI answer matches ground truth exactly. All 19 resource types listed with correct counts. No omissions, no hallucinations, no incorrect information."
},
"composite": 100,
"notFound": false
},
{
"id": "cross-sub-001",
"category": "cross-subsystem",
"difficulty": "hard",
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
"llmAnswer": "Based on the Cross-Subsystem Dependencies table in system-architecture.md:\n\n**Subsystems that depend on app-tools:**\n\n1. **ipam-tools** - 6 calls + 0 imports = **6 total**\n2. **compute-tools** - 4 calls + 0 imports = **4 total**\n3. **app-common** - 1 call + 0 imports = **1 total**",
"filesRead": [
"reference/contracts/index.md",
"reference/subsystems/app-tools.md",
"reference/system-architecture.md",
"reference/subsystems/runtime.md",
"reference/helm/index.md"
],
"plannedFiles": [
"reference/contracts/index.md",
"reference/subsystems/app-tools.md",
"reference/system-architecture.md",
"reference/subsystems/runtime.md",
"reference/helm/index.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI answer matches ground truth exactly on all three subsystems and their dependency counts. Breakdown into calls+imports adds useful detail without introducing errors. Clear citation of source document (system-architecture.md) demonstrates proper navigation."
},
"composite": 100,
"notFound": false
},
{
"id": "arch-002",
"category": "architectural",
"difficulty": "easy",
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
"llmAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop",
"filesRead": [
"reference/helm/charts/app-common-charts-mdm-app.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"plannedFiles": [
"reference/helm/charts/app-common-charts-mdm-app.md",
"reference/helm/index.md",
"reference/subsystems/app-common.md"
],
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"navigation": 5,
"notes": "AI provided both required values (version 0.7.4 and appVersion 2025.12-12-develop) with exact accuracy. No hallucinations or extraneous information. The correct values indicate successful location of the Helm chart metadata."
},
"composite": 100,
"notFound": false
}
]
}