Phase 9c: Split eval into Agent (file-browsing) and Human (readability) tracks

Agent eval: 54.3% (22 questions, 40.9% NOT_FOUND)
Human eval: 63.9% (28 questions, 17.9% NOT_FOUND)

Key findings:
- Agent navigation is the bottleneck (2.09/5) — long path-based filenames hurt discoverability
- Human findability is decent (3.46/5), but dependency questions fail (0%) because chart docs for wrapper charts don't surface their sub-chart deps
- Both tracks show strong precision (4.4+/5) — very low hallucination
- Resources (91%) and interactions (95%) score great for humans
- Configuration and contracts are solid across both tracks
2026-03-09 23:55:54 +00:00
parent 0cc4abcb0f
commit 304f0a9e9f
4 changed files with 2050 additions and 0 deletions
--- a/eval-agent-report.json
+++ b/eval-agent-report.json
@@ -0,0 +1,719 @@
+{
+  "evalType": "agent",
+  "timestamp": "2026-03-09T23:49:59.481Z",
+  "docsDir": "./foxtrot-docs",
+  "totalQuestions": 22,
+  "overallScore": "54.3",
+  "avgAccuracy": "2.27",
+  "avgCompleteness": "2.09",
+  "avgPrecision": "4.41",
+  "avgNavigation": "2.09",
+  "notFoundCount": 9,
+  "notFoundRate": "40.9%",
+  "byCategory": {
+    "structural": {
+      "avg": "68.3",
+      "count": 3
+    },
+    "dependency": {
+      "avg": "38.0",
+      "count": 5
+    },
+    "contract": {
+      "avg": "75.0",
+      "count": 1
+    },
+    "configuration": {
+      "avg": "55.0",
+      "count": 5
+    },
+    "interaction": {
+      "avg": "62.5",
+      "count": 2
+    },
+    "resource": {
+      "avg": "50.0",
+      "count": 4
+    },
+    "cross-subsystem": {
+      "avg": "100.0",
+      "count": 1
+    },
+    "architectural": {
+      "avg": "25.0",
+      "count": 1
+    }
+  },
+  "byDifficulty": {
+    "easy": {
+      "avg": "51.3",
+      "count": 8
+    },
+    "medium": {
+      "avg": "54.6",
+      "count": 12
+    },
+    "hard": {
+      "avg": "65.0",
+      "count": 2
+    }
+  },
+  "results": [
+    {
+      "id": "struct-001",
+      "category": "structural",
+      "difficulty": "easy",
+      "question": "How many Helm charts are defined across the Foxtrot monorepo?",
+      "groundTruth": "76",
+      "llmAnswer": "76",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/system-architecture.md",
+        "reference/contracts/index.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/system-architecture.md",
+        "reference/contracts/index.md"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 5,
+        "notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo."
+      },
+      "composite": 100,
+      "notFound": false
+    },
+    {
+      "id": "struct-002",
+      "category": "structural",
+      "difficulty": "medium",
+      "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
+      "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
+      "llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins (`app-common/charts/jenkins`) - 29 resources\n4. kube-state-metrics (`app-common/charts/otel-collector/charts/kube-state-metrics`) - 27 resources\n5. foxtrot-eks (`compute-common/templates/foxtrot-eks`) - 24 resources",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 4,
+        "notes": "AI answer matches ground truth exactly across all 5 items with correct resource counts and paths in proper ranking order. No hallucinations or extraneous information. Navigation score is 4 rather than 5 because process visibility is limited, but the specificity and accuracy of results strongly indicate effective information retrieval."
+      },
+      "composite": 95,
+      "notFound": false
+    },
+    {
+      "id": "struct-003",
+      "category": "structural",
+      "difficulty": "easy",
+      "question": "How many subsystems does the Foxtrot codebase contain?",
+      "groundTruth": "11",
+      "llmAnswer": "\n\n12",
+      "filesRead": [
+        "reference/system-architecture.md",
+        "reference/subsystems/root.md",
+        "reference/contracts/index.md",
+        "reference/helm/index.md",
+        "diagrams/all-contracts.mmd"
+      ],
+      "plannedFiles": [
+        "reference/system-architecture.md",
+        "reference/subsystems/root.md",
+        "reference/contracts/index.md",
+        "reference/helm/index.md",
+        "diagrams/all-contracts.mmd"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 1,
+        "navigation": 1,
+        "notes": "AI provided 12 subsystems instead of the correct answer of 11. Off-by-one error suggests either miscounting during analysis or hallucination of an additional subsystem. No evidence of proper source verification or navigation to authoritative documentation."
+      },
+      "composite": 10,
+      "notFound": false
+    },
+    {
+      "id": "dep-external-dns-001",
+      "category": "dependency",
+      "difficulty": "medium",
+      "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
+      "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
+      "llmAnswer": "NOT_FOUND\n\nThe documentation shows that the external-dns chart at `app-common/apps-provider/addons/charts/common/external-dns` has 1 dependency, but the specific name of that dependency is not listed in the provided files.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI returned NOT_FOUND when ground truth confirms 1 dependency exists: external-dns (1.17.0) with condition external-dns.enabled. Agent failed to locate the Chart.yaml or dependency information at the specified path."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "dep-ingress-nginx-001",
+      "category": "dependency",
+      "difficulty": "medium",
+      "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
+      "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
+      "llmAnswer": "NOT_FOUND\n\nThe Helm Charts index shows that the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` has 1 dependency, but the specific name of that dependency is not documented in the provided files.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI failed to locate the ingress-nginx dependency. Ground truth confirms the chart has 1 dependency: ingress-nginx (4.12.0) with condition ingress-nginx.enabled. The NOT_FOUND response indicates the agent did not successfully navigate to or parse the chart's dependency metadata."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "dep-kube-state-metrics-001",
+      "category": "dependency",
+      "difficulty": "medium",
+      "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
+      "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
+      "llmAnswer": "NOT_FOUND\n\nThe documentation shows that the kube-state-metrics chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` has 1 dependency (as indicated in the Helm Charts index table), but the specific name of that dependency is not listed in the provided files.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI answered NOT_FOUND when the dependency clearly exists in ground truth (kube-state-metrics 5.26.0). Failed to locate or extract the documented dependency information from the chart. No hallucinations present, but complete failure to find and report the correct answer."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "dep-open-telemetry-001",
+      "category": "dependency",
+      "difficulty": "medium",
+      "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
+      "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
+      "llmAnswer": "Based on the Helm Charts table in `reference/helm/index.md`, the \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector` has 7 dependencies.\n\nFrom the contracts diagram, these dependencies are:\n\n1. agent\n2. common\n3. gw-metrics\n4. gw-traces\n5. k8s-metrics\n6. prom-scraper\n7. opentelemetry-operator",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 3,
+        "completeness": 2,
+        "precision": 4,
+        "navigation": 2,
+        "notes": "AI correctly identifies all 7 dependency names but omits critical specification details: version numbers (e.g., opentelemetry-operator 0.26.4) and conditional flags (e.g., [condition: opentelemetry-operator.enabled]). Navigation references generic documentation rather than demonstrating access to the specific Chart.yaml file at the given path. Answer is partially correct but substantially incomplete."
+      },
+      "composite": 55,
+      "notFound": false
+    },
+    {
+      "id": "dep-reltio-prometheus-001",
+      "category": "dependency",
+      "difficulty": "medium",
+      "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
+      "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
+      "llmAnswer": "Based on the Helm Charts index in the provided documentation, the \"reltio-prometheus\" chart has **2 dependencies**.\n\nFrom the contract definition shown in the system documentation, these dependencies are:\n\n1. kube-prometheus-stack\n2. prometheus-adapter",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 3,
+        "completeness": 2,
+        "precision": 4,
+        "navigation": 3,
+        "notes": "AI correctly identified both dependency names (kube-prometheus-stack, prometheus-adapter) but omitted critical details: version numbers (69.4.1, 4.11.0) and conditions (kube-prometheus-stack.enabled, prometheus-adapter.enabled). The answer is partially correct but substantially incomplete relative to ground truth specification."
+      },
+      "composite": 60,
+      "notFound": false
+    },
+    {
+      "id": "contract-001",
+      "category": "contract",
+      "difficulty": "medium",
+      "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
+      "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
+      "llmAnswer": "Based on the interaction diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n   - cluster (app-common/apps-provider/addons/charts/common/cluster)\n   - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n   - cluster (app-common/apps-provider/addons/charts/common/cluster)\n   - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n   - ai-app (app-common/charts/ai-app)\n   - cleanse-app (app-common/charts/cleanse-app)\n   - pms-app (app-common/charts/pms-app)\n   - rdm-app (app-common/charts/rdm-app)\n   - shared-app (app-common/charts/shared-app)\n\n4. **vault-secret-rdm-tasks**\n   - ai-app (app-common/charts/ai-app)\n   - cleanse-app (app-common/charts/cleanse-app)\n   - pms-app (app-common/charts/pms-app)\n   - rdm-app (app-common/charts/rdm-app)\n   - shared-app (app-common/charts/shared-app)",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/system-architecture.md",
+        "reference/subsystems/app-common.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/system-architecture.md",
+        "reference/subsystems/app-common.md"
+      ],
+      "score": {
+        "accuracy": 4,
+        "completeness": 5,
+        "precision": 2,
+        "navigation": 4,
+        "notes": "AI correctly identified all four secrets and their associated charts. However, precision suffers from: (1) duplicating 'cluster' entries for logdna-agent-key and logdna-agent-tags with different paths when ground truth shows each only once, (2) including unnecessary full file paths that weren't requested, (3) formatting as numbered list with bold headers instead of the simple key-value format expected. The answer demonstrates successful information retrieval but poor adherence to the expected output format and includes potentially hallucinated path duplications."
+      },
+      "composite": 75,
+      "notFound": false
+    },
+    {
+      "id": "config-mdm-app-count",
+      "category": "configuration",
+      "difficulty": "easy",
+      "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
+      "groundTruth": "38",
+      "llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation files do not contain the complete contract definition for the mdm-app chart's values.yaml. While the chart is listed in the Helm Charts index (version 0.7.4 at `app-common/charts/mdm-app`), the detailed values schema showing the top-level configuration keys is not included in the excerpts provided.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI failed to locate the mdm-app chart's values.yaml file and retrieve the top-level configuration key count. Response was 'NOT_FOUND' instead of the correct answer of 38 keys. While the agent was honest about not finding the information (no hallucinations), it did not successfully navigate to or examine the target file."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "config-ai-app-count",
+      "category": "configuration",
+      "difficulty": "easy",
+      "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
+      "groundTruth": "34",
+      "llmAnswer": "Looking at the `reference/contracts/index.md` file, I can find the `ai_app_values` class which defines the top-level configuration keys for the ai-app chart at `app-common/charts/ai-app`.\n\nCounting the properties listed in the `ai_app_values` class:\n\nclusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain, domain, rootDomain, reltioDomains, spotinstPercent, nodeSelector, tolerations, ingress, serviceMesh, baseRegistryURL, imagePullSecrets, commonLabels, commonPodLabels, commonJobLabels, commonAnnotations, jobAnnotations, gcrregcred, global, commonservicesValues, efsClaimHeapDumps, securityContext, persistent, enableRdmAppServices, helmTimeout, hooks\n\n**34**",
+      "filesRead": [
+        "reference/subsystems/app-common.md",
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/charts/ai-app.md",
+        "reference/subsystems/app-common.md",
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 5,
+        "notes": "AI provided exact answer (34) matching ground truth. Demonstrated clear navigation by locating reference/contracts/index.md and identifying the ai_app_values class. Listed all 34 properties with no hallucinations or extraneous information. Methodology was sound and conclusion precise."
+      },
+      "composite": 100,
+      "notFound": false
+    },
+    {
+      "id": "config-elasticsearch-count",
+      "category": "configuration",
+      "difficulty": "easy",
+      "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
+      "groundTruth": "32",
+      "llmAnswer": "Looking at the contracts documentation for the elasticsearch chart at `app-common/apps-provider/addons/charts/state-apps/es`, the `elasticsearch_values` class shows the following top-level configuration keys:\n\nenvName, esClusterName, domain, lb, cloudProvider, esVersion, esMasterNodePerZoneCount, esDataNodePerZoneCount, xpkNodeCount, esZone1, esZone2, esZone3, masterStorageSize, masterStorageClass, dataStorageSize, dataStorageClass, masterJavaOpts, masterCpus, masterMems, dataJavaOpts, dataCpus, dataMems, dataVNGName, arch, onDemand, customConfig, customConfig_master, kibanaDomain, authorizationPolicy, metrics, livenessProbe, enableAnonymousAccess\n\n**32**",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/helm/charts/",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 5,
+        "notes": "AI provided the correct count (32) with all keys explicitly listed. Demonstrated direct navigation to the elasticsearch_values class in the contracts documentation at the correct chart path. No hallucinations or extraneous information present."
+      },
+      "composite": 100,
+      "notFound": false
+    },
+    {
+      "id": "config-hazelcast-count",
+      "category": "configuration",
+      "difficulty": "easy",
+      "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
+      "groundTruth": "32",
+      "llmAnswer": "NOT_FOUND\n\nThe specific chart documentation file for `app-common/apps-provider/addons/charts/state-apps/hz` is not included in the provided reference materials. While the chart is listed in the Helm Charts index, the detailed values.yaml documentation that would show the exact number of top-level configuration keys is not available.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/contracts/index.md",
+        "diagrams/app-common-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI agent failed to locate the hazelcast chart's values.yaml file at the specified path (app-common/apps-provider/addons/charts/state-apps/hz). The correct answer is 32 top-level configuration keys. While the agent's reasoning about missing reference materials is precise and doesn't hallucinate facts, it demonstrates complete failure in navigation and retrieval of the required information."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "config-redis-count",
+      "category": "configuration",
+      "difficulty": "easy",
+      "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
+      "groundTruth": "23",
+      "llmAnswer": "NOT_FOUND\n\nThe provided documentation includes the redis chart in the Helm Charts index (version 1.0.0 at `app-common/charts/redis`), but does not contain the detailed values.yaml structure showing the top-level configuration keys for this chart.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/subsystems/app-common.md",
+        "reference/helm/charts/redis/values.yaml"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI answered NOT_FOUND when the correct answer is 23 top-level configuration keys. While the agent's response reflects that detailed values.yaml structure wasn't in the provided documentation, it failed to retrieve or provide the factually correct answer. The agent demonstrated no navigation to the actual redis chart values.yaml file to count the keys."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "interaction-001",
+      "category": "interaction",
+      "difficulty": "medium",
+      "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
+      "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
+      "llmAnswer": "Based on the documentation provided, here are the Helm charts that reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) - references `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) - references `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) - references `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) - references `headless` service",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md",
+        "diagrams/helm-interactions.mmd"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/helm/charts/"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 4,
+        "notes": "AI answer matches ground truth exactly on all four chart-service pairs with correct paths and service names. No hallucinations or omissions. Navigation score reduced to 4 rather than 5 due to lack of visibility into the discovery process, though results suggest effective information retrieval."
+      },
+      "composite": 95,
+      "notFound": false
+    },
+    {
+      "id": "interaction-002",
+      "category": "interaction",
+      "difficulty": "hard",
+      "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
+      "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
+      "llmAnswer": "Based on the documentation provided, here are the network ports used by multiple Helm charts:\n\n**Port 443:**\n- cluster\n- twistlock-defender\n- eck-operator\n\nThis is the only port that appears across multiple Helm charts in the provided documentation.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/system-architecture.md",
+        "reference/contracts/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/subsystems/network-core.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/system-architecture.md",
+        "reference/contracts/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/subsystems/network-core.md"
+      ],
+      "score": {
+        "accuracy": 2,
+        "completeness": 1,
+        "precision": 2,
+        "navigation": 1,
+        "notes": "AI identified only 1 of 4 ports (port 443) and missed one chart for that port (twistlock). Completely failed to identify ports 8889, 9200, and 13133. The claim that port 443 is 'the only port' contradicts ground truth showing 4 ports used by multiple charts. Demonstrates poor information retrieval or severely limited documentation access."
+      },
+      "composite": 30,
+      "notFound": false
+    },
+    {
+      "id": "resource-001",
+      "category": "resource",
+      "difficulty": "medium",
+      "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
+      "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
+      "llmAnswer": "\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md",
+        "diagrams/all-contracts.mmd",
+        "reference/subsystems/app-common.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md",
+        "diagrams/all-contracts.mmd",
+        "reference/subsystems/app-common.md"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 5,
+        "notes": "AI answer matches ground truth exactly—all 10 resource types present with correct counts and ranking. No hallucinations or extraneous information. Perfect alignment across all dimensions."
+      },
+      "composite": 100,
+      "notFound": false
+    },
+    {
+      "id": "resource-mdm-app-001",
+      "category": "resource",
+      "difficulty": "medium",
+      "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
+      "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
+      "llmAnswer": "NOT_FOUND\n\nThe provided documentation shows that the mdm-app chart generates 22 resources total, but does not specify the individual Kubernetes resource types or their counts.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/subsystems/app-common.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/helm/charts/",
+        "reference/subsystems/app-common.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "Agent failed to locate the specific Kubernetes resource types and their counts from the mdm-app chart. While it identified that 22 resources total are generated, it did not extract the breakdown by resource type (Secret: 5, Job: 4, etc.). The answer 'NOT_FOUND' is honest but represents complete failure to answer the question despite ground truth being available."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "resource-cassandra-001",
+      "category": "resource",
+      "difficulty": "medium",
+      "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
+      "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
+      "llmAnswer": "Based on the contracts documentation provided, the cassandra chart generates the following Kubernetes resource types:\n\n- StatefulSet: 1\n- Service: 2\n- CronJob: 1\n- Job: 1\n\nNote: The helm index indicates the cassandra chart contains 14 total resources, but the contracts documentation only details these 5 resources. The remaining 9 resources are not specified in the provided files.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "diagrams/all-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/charts/cassandra.md",
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "diagrams/all-contracts.mmd",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 3,
+        "completeness": 1,
+        "precision": 4,
+        "navigation": 2,
+        "notes": "AI correctly identified 4 of 13 resource types with accurate counts, but missed 9 types entirely (ExternalSecret, ClusterSecretStore, ConfigMap, Secret, ServiceAccount, Role, RoleBinding, ClusterRole, ClusterRoleBinding). The agent acknowledged incomplete documentation access rather than locating comprehensive resource specifications. No hallucinations, but severe coverage gap indicates failure to find authoritative source."
+      },
+      "composite": 50,
+      "notFound": false
+    },
+    {
+      "id": "resource-jenkins-001",
+      "category": "resource",
+      "difficulty": "medium",
+      "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
+      "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
+      "llmAnswer": "NOT_FOUND\n\nThe documentation shows that the jenkins chart (version 5.8.45) generates 29 resources total, but the specific Kubernetes resource types and their individual counts are not detailed in the provided files.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/contracts/index.md",
+        "diagrams/all-contracts.mmd"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/helm/charts/jenkins.md",
+        "diagrams/helm-interactions.mmd",
+        "reference/contracts/index.md",
+        "diagrams/all-contracts.mmd"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI claimed information was not available in provided files, but ground truth demonstrates 19 distinct Kubernetes resource types with specific counts are documented. Agent failed to locate or extract the required data."
+      },
+      "composite": 25,
+      "notFound": true
+    },
+    {
+      "id": "cross-sub-001",
+      "category": "cross-subsystem",
+      "difficulty": "hard",
+      "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
+      "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
+      "llmAnswer": "Based on the Cross-Subsystem Dependencies table in the system architecture document:\n\n- **ipam-tools**: 6 calls + 0 imports = 6 total\n- **compute-tools**: 4 calls + 0 imports = 4 total\n- **app-common**: 1 call + 0 imports = 1 total",
+      "filesRead": [
+        "reference/subsystems/app-tools.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md",
+        "reference/subsystems/app-common.md",
+        "diagrams/all-contracts.mmd"
+      ],
+      "plannedFiles": [
+        "reference/subsystems/app-tools.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md",
+        "reference/subsystems/app-common.md",
+        "diagrams/all-contracts.mmd"
+      ],
+      "score": {
+        "accuracy": 5,
+        "completeness": 5,
+        "precision": 5,
+        "navigation": 5,
+        "notes": "AI answer matches ground truth exactly on all three subsystems with correct dependency counts (ipam-tools: 6, compute-tools: 4, app-common: 1). The breakdown into calls and imports adds useful detail without introducing errors. Proper source attribution demonstrates successful navigation to the architecture documentation."
+      },
+      "composite": 100,
+      "notFound": false
+    },
+    {
+      "id": "arch-002",
+      "category": "architectural",
+      "difficulty": "easy",
+      "question": "What is the current version and appVersion of the mdm-app Helm chart?",
+      "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
+      "llmAnswer": "NOT_FOUND\n\nThe documentation provided shows the mdm-app Helm chart version is **0.7.4**, but the appVersion is not included in the reference materials.",
+      "filesRead": [
+        "reference/helm/index.md",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md"
+      ],
+      "plannedFiles": [
+        "reference/helm/index.md",
+        "reference/helm/charts/",
+        "reference/contracts/index.md",
+        "reference/system-architecture.md"
+      ],
+      "score": {
+        "accuracy": 0,
+        "completeness": 0,
+        "precision": 5,
+        "navigation": 0,
+        "notes": "AI agent returned NOT_FOUND despite ground truth showing version 0.7.4 and appVersion 2025.12-12-develop are available. Agent failed to locate the mdm-app Helm chart metadata. No hallucinated data, but complete failure to retrieve required information."
+      },
+      "composite": 25,
+      "notFound": true
+    }
+  ]
+}