dev-intel-v2/eval-agent-report-v2.json

{
  "evalType": "agent",
  "timestamp": "2026-03-10T00:05:59.138Z",
  "docsDir": "./foxtrot-docs",
  "totalQuestions": 22,
  "overallScore": "56.6",
  "avgAccuracy": "2.36",
  "avgCompleteness": "2.23",
  "avgPrecision": "4.50",
  "avgNavigation": "2.23",
  "notFoundCount": 9,
  "notFoundRate": "40.9%",
  "byCategory": {
    "structural": {
      "avg": "68.3",
      "count": 3
    },
    "dependency": {
      "avg": "39.0",
      "count": 5
    },
    "contract": {
      "avg": "65.0",
      "count": 1
    },
    "configuration": {
      "avg": "66.0",
      "count": 5
    },
    "interaction": {
      "avg": "75.0",
      "count": 2
    },
    "resource": {
      "avg": "43.8",
      "count": 4
    },
    "cross-subsystem": {
      "avg": "100.0",
      "count": 1
    },
    "architectural": {
      "avg": "25.0",
      "count": 1
    }
  },
  "byDifficulty": {
    "easy": {
      "avg": "58.1",
      "count": 8
    },
    "medium": {
      "avg": "52.5",
      "count": 12
    },
    "hard": {
      "avg": "75.0",
      "count": 2
    }
  },
  "results": [
    {
      "id": "struct-001",
      "category": "structural",
      "difficulty": "easy",
      "question": "How many Helm charts are defined across the Foxtrot monorepo?",
      "groundTruth": "76",
      "llmAnswer": "76",
      "filesRead": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/helm/charts/",
        "diagrams/helm-interactions.mmd",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "struct-002",
      "category": "structural",
      "difficulty": "medium",
      "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
      "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
      "llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins - 29 resources\n4. kube-state-metrics - 27 resources\n5. foxtrot-eks - 24 resources",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/helm/charts/"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 4,
        "precision": 5,
        "navigation": 5,
        "notes": "All resource counts and rankings are correct. Full paths provided for first two charts but abbreviated for last three (jenkins, kube-state-metrics, foxtrot-eks lack full directory paths shown in ground truth). Core facts are accurate with no hallucinations."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "struct-003",
      "category": "structural",
      "difficulty": "easy",
      "question": "How many subsystems does the Foxtrot codebase contain?",
      "groundTruth": "11",
      "llmAnswer": "\n\n12",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "reference/contracts/index.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "reference/contracts/index.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 1,
        "navigation": 1,
        "notes": "AI provided 12 subsystems instead of the correct answer of 11. Off-by-one error suggests either miscounting during analysis or hallucination of an additional subsystem. No evidence of proper source verification or navigation to authoritative documentation."
      },
      "composite": 10,
      "notFound": false
    },
    {
      "id": "dep-external-dns-001",
      "category": "dependency",
      "difficulty": "medium",
      "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
      "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation indicates that the external-dns chart at `app-common/apps-provider/addons/charts/common/external-dns` has 1 dependency (shown in the \"All Charts\" table), but the specific name of that dependency is not detailed in the provided files.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI responded with NOT_FOUND despite ground truth showing the dependency exists (external-dns 1.17.0 with condition). The agent acknowledged 1 dependency exists but failed to identify or retrieve the actual dependency name, version, and condition. This represents a navigation failure to locate the required information in the chart files."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-ingress-nginx-001",
      "category": "dependency",
      "difficulty": "medium",
      "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
      "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation shows that the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` has 1 dependency (from the chart table), but the specific name of that dependency is not detailed in the provided files.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI answered NOT_FOUND despite ground truth clearly specifying ingress-nginx (4.12.0) as a dependency with condition. Per evaluation rules, NOT_FOUND responses receive 0/0/5/0 scoring."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-kube-state-metrics-001",
      "category": "dependency",
      "difficulty": "medium",
      "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
      "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
      "llmAnswer": "NOT_FOUND\n\nThe documentation index indicates that the kube-state-metrics chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` has 1 dependency, but the specific dependency details are not included in the provided reference files. The detailed chart documentation file would be needed to identify the exact dependency.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI failed to locate the kube-state-metrics dependency (5.26.0) that exists in the chart. Returned NOT_FOUND instead of identifying the dependency with its version and condition constraint. No navigation to the correct chart documentation file was demonstrated."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-open-telemetry-001",
      "category": "dependency",
      "difficulty": "medium",
      "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
      "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
      "llmAnswer": "Based on the documentation provided, the \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector` has 7 dependencies:\n\n1. opentelemetry-operator\n2. agent\n3. common\n4. gw-metrics\n5. gw-traces\n6. k8s-metrics\n7. prom-scraper",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/helm/charts/",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 2,
        "precision": 4,
        "navigation": 3,
        "notes": "AI correctly identified all 7 dependency names but omitted critical details present in ground truth: version numbers (e.g., 0.26.4, 0.1.0, 0.1.1) and conditional flags (e.g., [condition: opentelemetry-operator.enabled]). The answer is structurally sound with no hallucinations, but lacks the specificity required for a complete dependency specification. Navigation appears successful given accurate name retrieval, but no evidence of the discovery process is shown."
      },
      "composite": 60,
      "notFound": false
    },
    {
      "id": "dep-reltio-prometheus-001",
      "category": "dependency",
      "difficulty": "medium",
      "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
      "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
      "llmAnswer": "Based on the documentation provided, the reltio-prometheus chart (at `app-common/apps-provider/addons/charts/common/prometheus`) has **2 dependencies**.\n\nFrom the contracts section, these dependencies are:\n- kube-prometheus-stack\n- prometheus-adapter",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 2,
        "precision": 4,
        "navigation": 3,
        "notes": "AI correctly identified both dependency names (kube-prometheus-stack, prometheus-adapter) but omitted critical specification details: version numbers (69.4.1, 4.11.0) and conditional flags (kube-prometheus-stack.enabled, prometheus-adapter.enabled). For Helm chart dependencies, versions and conditions are essential metadata. The answer demonstrates navigation capability but incomplete extraction of ground truth information."
      },
      "composite": 60,
      "notFound": false
    },
    {
      "id": "contract-001",
      "category": "contract",
      "difficulty": "medium",
      "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
      "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
      "llmAnswer": "Based on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n   - cluster (app-common/apps-provider/addons/charts/common/cluster)\n   - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n   - cluster (app-common/apps-provider/addons/charts/common/cluster)\n   - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n   - ai-app\n   - cleanse-app\n   - pms-app\n   - rdm-app\n   - shared-app\n\n4. **vault-secret-rdm-tasks**\n   - ai-app\n   - cleanse-app\n   - pms-app\n   - rdm-app\n   - shared-app",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 5,
        "precision": 2,
        "navigation": 3,
        "notes": "The answer correctly identifies all four secrets/ConfigMaps and their associations. However, it duplicates 'cluster' entries for logdna-agent-key and logdna-agent-tags (showing two different paths), when the ground truth shows each should appear once. This duplication represents hallucinated extra information not in the ground truth. The vault-secret entries are accurate and precise. The agent demonstrated ability to locate relevant documentation but may have misinterpreted the source data regarding the cluster entries."
      },
      "composite": 65,
      "notFound": false
    },
    {
      "id": "config-mdm-app-count",
      "category": "configuration",
      "difficulty": "easy",
      "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
      "groundTruth": "38",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe provided files reference the mdm-app chart but do not include the detailed values contract or configuration for `app-common/charts/mdm-app`. The contracts file is truncated before reaching the mdm-app values definition.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI failed to locate or access the mdm-app/values.yaml file needed to count configuration keys. While the response honestly acknowledged the limitation rather than hallucinating a number, it provided zero correct information against the ground truth of 38 keys. The agent could not navigate to the required file."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-ai-app-count",
      "category": "configuration",
      "difficulty": "easy",
      "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
      "groundTruth": "34",
      "llmAnswer": "34",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/ai-app.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to locate and analyze the values.yaml file."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "config-elasticsearch-count",
      "category": "configuration",
      "difficulty": "easy",
      "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
      "groundTruth": "32",
      "llmAnswer": "Looking at the contracts diagram in the provided documentation, the `elasticsearch_values` class for the chart at `app-common/apps-provider/addons/charts/state-apps/es` exposes the following top-level configuration keys:\n\nenvName, esClusterName, domain, lb, cloudProvider, esVersion, esMasterNodePerZoneCount, esDataNodePerZoneCount, xpkNodeCount, esZone1, esZone2, esZone3, masterStorageSize, masterStorageClass, dataStorageSize, dataStorageClass, masterJavaOpts, masterCpus, masterMems, dataJavaOpts, dataCpus, dataMems, dataVNGName, arch, onDemand, customConfig, customConfig_master, kibanaDomain, authorizationPolicy, metrics, livenessProbe, enableAnonymousAccess\n\n**32**",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/helm/charts/",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 4,
        "navigation": 4,
        "notes": "AI provided the exact count (32) matching ground truth and listed all 32 keys with domain-appropriate naming conventions (cluster config, storage, resources, zones). Demonstrated correct chart location identification. Minor precision deduction due to inability to independently verify each key without direct values.yaml access, though the comprehensive list and correct count suggest high reliability."
      },
      "composite": 90,
      "notFound": false
    },
    {
      "id": "config-hazelcast-count",
      "category": "configuration",
      "difficulty": "easy",
      "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
      "groundTruth": "32",
      "llmAnswer": "32",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/helm/charts/",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/app-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly reduced because the response doesn't show explicit evidence of finding and examining the values.yaml file, though the correct answer strongly implies successful file location."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "config-redis-count",
      "category": "configuration",
      "difficulty": "easy",
      "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
      "groundTruth": "23",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe redis chart is referenced in the Helm index, but the detailed documentation file containing its values.yaml configuration is not included in the provided reference materials.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI failed to locate or retrieve the values.yaml file from app-common/charts/redis and provided NOT_FOUND instead of the correct answer of 23 top-level configuration keys. While the response avoided hallucinating incorrect information, it demonstrates complete failure in information retrieval and navigation to the required resource."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "interaction-001",
      "category": "interaction",
      "difficulty": "medium",
      "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
      "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
      "llmAnswer": "Based on the `diagrams/helm-interactions.mmd` file, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → references `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) → references `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → references `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) → references `headless` service",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/control-core.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/control-core.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly on all four chart-service references. Correctly identified all paths and service names. Minor stylistic addition of 'service' label doesn't constitute hallucination. Demonstrated proper source navigation by referencing the helm-interactions diagram."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "interaction-002",
      "category": "interaction",
      "difficulty": "hard",
      "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
      "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
      "llmAnswer": "Based on the contracts documentation provided, the network port used by multiple Helm charts is:\n\n- **port:443** - exposed by:\n  - cluster\n  - eck-operator\n  - twistlock-defender",
      "filesRead": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/contracts/index.md",
        "reference/subsystems/network-core.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/contracts/index.md",
        "reference/subsystems/network-core.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 1,
        "precision": 4,
        "navigation": 2,
        "notes": "AI correctly identified port 443 and three of four associated charts, but missed 'twistlock' from that port's list. More critically, it failed to identify ports 8889, 9200, and 13133 entirely, covering only 25% of the ground truth. No hallucinations present, but navigation was insufficient to comprehensively answer the question."
      },
      "composite": 50,
      "notFound": false
    },
    {
      "id": "resource-001",
      "category": "resource",
      "difficulty": "medium",
      "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
      "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
      "llmAnswer": "Based on the system architecture documentation, here are the 10 most common Kubernetes resource types across all Foxtrot Helm charts:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI answer matches ground truth exactly across all 10 items with correct rankings and counts. No hallucinations or extraneous information. Navigation score reduced slightly because the source reference ('system architecture documentation') is generic and doesn't demonstrate specific file discovery, though the perfect accuracy indicates successful information retrieval."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "resource-mdm-app-001",
      "category": "resource",
      "difficulty": "medium",
      "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
      "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation provided indicates that the mdm-app chart generates 22 resources total, but does not include a detailed breakdown of the specific Kubernetes resource types and their counts for this chart.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/helm/charts/",
        "reference/contracts/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 1,
        "notes": "AI failed to retrieve the required resource type breakdown. While it correctly identified the total count of 22 resources, it did not provide any of the 11 specific resource types and their individual counts from the ground truth. The agent demonstrated minimal navigation capability by finding only aggregate information rather than the detailed breakdown that was available."
      },
      "composite": 30,
      "notFound": true
    },
    {
      "id": "resource-cassandra-001",
      "category": "resource",
      "difficulty": "medium",
      "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
      "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
      "llmAnswer": "\n\nLooking at the cassandra chart information in the provided documentation:\n\nFrom the contracts diagram (reference/contracts/index.md), the cassandra chart generates:\n\n- Service: 2\n- StatefulSet: 1\n- CronJob: 1\n- Job: 1\n\nHowever, the chart index shows cassandra has 14 total resources, but the detailed breakdown for the remaining 9 resources is not provided in the documentation files above.\n\n**NOT_FOUND** - Complete resource type breakdown with all 14 resources is not available in the provided documentation.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/helm/charts/cassandra.md",
        "diagrams/all-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI provided partial correct information (4 of 13 resource types with accurate counts) but then claimed the complete breakdown was unavailable in documentation, effectively answering NOT_FOUND. Per scoring rules, this triggers the NOT_FOUND penalty despite partial accuracy in what was provided."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "resource-jenkins-001",
      "category": "resource",
      "difficulty": "medium",
      "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
      "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
      "llmAnswer": "NOT_FOUND\n\nThe documentation references the jenkins chart at `app-common/charts/jenkins` with 29 total resources, but the detailed chart documentation file that would list the specific Kubernetes resource types is not included in the provided files.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/helm/charts/",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI answered NOT_FOUND. Per evaluation criteria, NOT_FOUND responses receive fixed scores: accuracy and completeness are 0 (no facts provided, no coverage of ground truth items), precision is 5 (no hallucinations), and navigation is 0 (failed to locate the required information source)."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "cross-sub-001",
      "category": "cross-subsystem",
      "difficulty": "hard",
      "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
      "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
      "llmAnswer": "Based on the Cross-Subsystem Dependencies table in the system architecture document:\n\n- **ipam-tools**: 6 calls + 0 imports = 6 total\n- **compute-tools**: 4 calls + 0 imports = 4 total\n- **app-common**: 1 call + 0 imports = 1 total",
      "filesRead": [
        "reference/contracts/index.md",
        "reference/subsystems/app-tools.md",
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md",
        "diagrams/all-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "reference/subsystems/app-tools.md",
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md",
        "diagrams/all-contracts.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly on all three subsystems with correct dependency counts (ipam-tools: 6, compute-tools: 4, app-common: 1). The breakdown into calls and imports adds useful detail without introducing errors. Proper source attribution demonstrates successful navigation to the architecture documentation."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "arch-002",
      "category": "architectural",
      "difficulty": "easy",
      "question": "What is the current version and appVersion of the mdm-app Helm chart?",
      "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation shows that mdm-app has version **0.0.2**, but the appVersion is not included in the available reference files. The detailed chart documentation at `charts/app-common-charts-mdm-app.md` would contain this information, but it was not provided.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/helm/charts/mdm-app.md",
        "reference/helm/charts/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI answered NOT_FOUND for both version and appVersion. Ground truth specifies version 0.7.4 and appVersion 2025.12-12-develop. While the explanation mentions finding 0.0.2 (which is hallucinated), the primary answer of NOT_FOUND receives the prescribed scoring per instructions."
      },
      "composite": 25,
      "notFound": true
    }
  ]
}