dev-intel-v2/eval-ref-pages-v1.json

{
  "evalType": "agent",
  "timestamp": "2026-03-10T17:40:55.536Z",
  "docsDir": "./foxtrot-docs",
  "totalQuestions": 32,
  "overallScore": "46.7",
  "avgAccuracy": "1.91",
  "avgCompleteness": "1.81",
  "avgPrecision": "3.94",
  "avgNavigation": "1.69",
  "notFoundCount": 16,
  "notFoundRate": "50.0%",
  "byCategory": {
    "architecture": {
      "avg": "71.7",
      "count": 6
    },
    "dependencies": {
      "avg": "27.0",
      "count": 5
    },
    "operations": {
      "avg": "25.0",
      "count": 6
    },
    "configuration": {
      "avg": "65.8",
      "count": 6
    },
    "services": {
      "avg": "63.0",
      "count": 5
    },
    "contracts": {
      "avg": "17.5",
      "count": 4
    }
  },
  "byDifficulty": {
    "easy": {
      "avg": "42.0",
      "count": 10
    },
    "medium": {
      "avg": "51.2",
      "count": 17
    },
    "hard": {
      "avg": "41.0",
      "count": 5
    }
  },
  "results": [
    {
      "id": "arch-layered-order",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What are the five layers in Foxtrot's layered architecture, listed from top to bottom?",
      "groundTruth": "Application Layer (app-*), Compute Layer (compute-*), Network Layer (network-*), Account Layer (account-*), Control Plane (control-core).",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation describes a layered architecture with 9 layers (Runtime, Application, Compute, Network, IPAM, Account, Control Plane, Skills/Tooling, and Documentation), not five layers as specified in the question.",
      "filesRead": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "reference/subsystems/root.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "reference/subsystems/root.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 4,
        "navigation": 2,
        "notes": "AI returned NOT_FOUND instead of providing the five layers from ground truth. The agent found layer-related documentation but reported 9 layers rather than the expected 5, suggesting either a mismatch between documentation and ground truth, or the agent accessed incorrect documentation. No hallucinated facts about the five layers themselves, but failed to locate or provide the correct answer."
      },
      "composite": 30,
      "notFound": true
    },
    {
      "id": "arch-hub-spoke-ownership",
      "category": "architecture",
      "difficulty": "easy",
      "question": "Which ArgoCD instance owns the account, network, and compute layers, and which owns the application layer?",
      "groundTruth": "Hub ArgoCD owns account, network, and compute layers (infrastructure and control-plane appsets). Spoke ArgoCD owns the application layer (app-runtime appsets only).",
      "llmAnswer": "Based on the system architecture documentation provided:\n\n**Hub (Infrastructure/Control Plane) ArgoCD instance** owns:\n- **Account layer**: `account-creation` ApplicationSet manages `account-runtime`\n- **Network layer**: `aws-vpcs`, `gcp-vpcs`, `azure-vpcs` ApplicationSets manage `network-runtime`\n- **Compute layer**: `aws-argocd-feature-clusters`, `gcp-argocd-feature-clusters`, `azure-argocd-feature-clusters` ApplicationSets manage `compute-runtime`\n\n**Spoke (Applications/Runtime) ArgoCD instance(s)** own:\n- **Application layer**: Managed via ApplicationSets defined in the `argocd-apps` chart (e.g., `applicationset-apps.yaml`, `applicationset-addons.yaml`, `applicationset-state-apps.yaml`)",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 3,
        "navigation": 2,
        "notes": "Core facts are correct: Hub owns infrastructure/control-plane layers, Spoke owns application layer. However, specific ApplicationSet names (account-creation, aws-vpcs, gcp-vpcs, etc.) and file references (applicationset-apps.yaml, applicationset-addons.yaml) appear to be plausible but unverified details that may be hallucinated. The answer demonstrates understanding of the architecture but lacks evidence of actually navigating to source documentation to verify specific component names."
      },
      "composite": 75,
      "notFound": false
    },
    {
      "id": "arch-aws-regions",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What AWS regions does Foxtrot support?",
      "groundTruth": "us-east-1, us-west-2, eu-west-1, eu-central-1.",
      "llmAnswer": "Based on the documentation provided, the AWS regions supported by Foxtrot are:\n\n**ap-southeast-1, eu-central-1, eu-west-1, eu-west-2, eu-west-3, us-east-1, us-east-2, us-west-1, us-west-2**",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 4,
        "completeness": 5,
        "precision": 1,
        "navigation": 2,
        "notes": "AI answer includes all 4 ground truth regions correctly but adds 5 unverified regions (ap-southeast-1, eu-west-2, eu-west-3, us-east-2, us-west-1). While completeness is perfect, the significant hallucination or misattribution of extra regions severely impacts precision. Navigation appears partially successful—the agent located correct information but either fabricated additional regions or consulted an unreliable source."
      },
      "composite": 60,
      "notFound": false
    },
    {
      "id": "arch-gcp-shared-vpc-host",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the default GCP host project used for Shared VPC in network-common, and what is the host network path it attaches to?",
      "groundTruth": "The default host project is `network-services-436015`. When `sharedVpc.enabled: true`, service project resources attach to the host network path `projects/HOST_PROJECT/global/networks/gcp-core-network`.",
      "llmAnswer": "The default GCP host project used for Shared VPC is `network-services-436015`, and the host network path it attaches to is `projects/network-services-436015/global/networks/gcp-core-network`.",
      "filesRead": [
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/network-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/network-architecture.md"
      ],
      "score": {
        "accuracy": 4,
        "completeness": 3,
        "precision": 5,
        "navigation": 3,
        "notes": "AI correctly identified the host project and network path. However, the ground truth presents the network path as a template (`projects/HOST_PROJECT/global/networks/gcp-core-network`), while the AI provided the fully resolved path. The answer omits important context: the conditional trigger (`sharedVpc.enabled: true`) and the clarification that service project resources attach to this host network. No hallucinations present, but missing contextual completeness reduces the score."
      },
      "composite": 75,
      "notFound": false
    },
    {
      "id": "arch-cidr-employee-access",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for the employee access (bastions) segment on AWS and on GCP?",
      "groundTruth": "AWS: 10.208.128.0/24. GCP: 10.128.128.0/24.",
      "llmAnswer": "**AWS employee access (bastions):** 10.208.128.0/24\n\n**GCP employee access (bastions):** 10.128.128.0/24",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-common.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-common.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both AWS and GCP CIDR ranges. All required information present with no hallucinations or extraneous details. Perfect response."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "arch-production-cidr",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for production workloads on AWS and on GCP?",
      "groundTruth": "AWS: 10.192.0.0/10. GCP: 10.128.0.0/10.",
      "llmAnswer": "**AWS:** 10.192.0.0/10\n\n**GCP:** 10.128.0.0/10",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/system-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-common.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/system-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-common.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 3,
        "notes": "AI answer matches ground truth exactly on both AWS and GCP CIDR ranges. No extraneous information or hallucinations. However, navigation score is moderate because the agent provided the answer directly without demonstrating a discovery process or showing where this information was sourced from."
      },
      "composite": 90,
      "notFound": false
    },
    {
      "id": "dep-runtime-common-horizontal",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "Which runtime repositories consume charts from which common repositories?",
      "groundTruth": "app-runtime consumes charts from app-common; compute-runtime consumes charts from compute-common; network-runtime consumes charts from network-common; account-runtime consumes charts from account-common.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit information mapping which specific runtime repositories consume charts from which specific common repositories. While the files reference runtime subdirectories (`runtime/account-runtime/`, `runtime/app-runtime/`, `runtime/compute-runtime/`) and show they contain configuration files referencing addons and charts, the documentation does not provide a clear consumption relationship between runtime repositories and common repositories in a structured format.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/runtime.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/runtime.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI claimed information was not available in documentation when ground truth establishes clear consumption mappings between runtime and common repositories. Agent failed to locate or extract the required facts despite them existing in the source material."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-vertical-layers",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "What are the vertical layer dependencies in Foxtrot's architecture?",
      "groundTruth": "Application Layer depends on Compute Layer; Compute Layer depends on Network Layer; Network Layer depends on Account Layer; Account Layer is managed by the Control Plane.",
      "llmAnswer": "Based on the system architecture documentation provided, the vertical layer dependencies in Foxtrot's architecture are:\n\n**Layered Architecture (top to bottom):**\n1. Runtime (shared)\n2. Application (app-common, app-tools)\n3. Compute (compute-common, compute-tools)\n4. Network (network-common, network-core)\n5. IPAM (ipam-core, ipam-tools)\n6. Account (account-common)\n7. Control Plane (control-core)\n8. Skills/Tooling (skills)\n9. Documentation (docs)\n\n**Key vertical dependencies:**\n- All subsystems depend on `root` (the cross-cutting concern at the base)\n- The `*-tools` subsystems (app-tools, compute-tools, ipam-tools) form a shared operational tier that sits above root but below domain consumers\n- `ipam-tools` depends on both `app-tools` and `compute-tools` (reflecting that IP address management must reason about both application topology and compute resource allocation simultaneously)\n- Higher-level domain subsystems (app-common, compute-common) depend on their corresponding tools layers\n\nThe architecture explicitly favors layering over encapsulation, maintaining a flat dependency graph where infrastructure state and operational tooling remain visibly distinct.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "diagrams/system-deps.mmd",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "diagrams/system-deps.mmd",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 1,
        "navigation": 2,
        "notes": "The AI answer describes a fundamentally different dependency model than ground truth. Ground truth specifies a linear chain (App→Compute→Network→Account→Control Plane), but the AI describes a complex web involving tools layers, IPAM, and cross-cutting concerns with 'root' dependencies. The answer contains extensive hallucinated architectural details (tools subsystems, IPAM dependencies, flat dependency graphs) not present in ground truth. While the AI found some architectural documentation, it either consulted the wrong source or severely misinterpreted the actual dependency structure."
      },
      "composite": 25,
      "notFound": false
    },
    {
      "id": "dep-create-account-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "How many PRs does `create-account` produce and which repositories does it touch?",
      "groundTruth": "4 PRs: account-runtime, network-core (account principal updates), control-core (provider config + assume-role policy updates), and cloud-iam (account bootstrap TF).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about `create-account` producing PRs or which repositories it touches.",
      "filesRead": [
        "reference/contracts/index.md",
        "agent-kb.json",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "agent-kb.json",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent correctly avoided hallucination by reporting NOT_FOUND rather than inventing details. However, it failed to locate documentation describing create-account's PR generation across the four target repositories (account-runtime, network-core, control-core, cloud-iam). The agent either didn't search the right files or the documentation wasn't accessible in the provided context."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-create-cluster-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "Which repositories does `create-cluster` commit to, and how many PRs does it typically produce?",
      "groundTruth": "4 PRs: compute-runtime (cluster path) plus 3 PRs in cloud-auth-core (cluster/node role PR, IRSA role PR, and network role PR). It also calls cluster-netbox-allocate.py to reserve/allocate cluster subnet CIDRs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation references `create-cluster.sh` in the `compute-tools/bash-spoke-setup/scripts/` directory and documents its timeout parameters, but contains no information about which repositories it commits to or how many PRs it typically produces.",
      "filesRead": [
        "agent-kb.json",
        "reference/contracts/index.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md"
      ],
      "plannedFiles": [
        "agent-kb.json",
        "reference/contracts/index.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the available documentation lacked the requested information, avoiding hallucination. However, it failed to locate the actual answer present in the codebase or documentation. The ground truth specifies 4 PRs across compute-runtime and cloud-auth-core repositories with specific role-based PR types, plus a call to cluster-netbox-allocate.py—none of which the AI discovered. Honest about limitations (high precision) but unsuccessful at information retrieval (zero navigation and completeness)."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-compute-common-deps",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "What are the core dependencies of compute-common for cluster provisioning?",
      "groundTruth": "Crossplane core + provider-aws/provider-kubernetes, Helm, KCL build pipeline, and ArgoCD deployment flow.",
      "llmAnswer": "Based on the documentation provided, the core dependencies of compute-common for cluster provisioning are:\n\n1. **root** (29 imports) - Foundational configuration and environment resolution primitives that are essential for cluster lifecycle orchestration and environment-aware test parameterization\n\n2. **Helm Charts & Kubernetes Manifests** (64 files total):\n   - `foxtrot-aks` - AKS cluster templates\n   - `foxtrot-eks` - EKS cluster templates  \n   - `foxtrot-gke` - GKE cluster templates\n\n3. **Crossplane Compositions** for multi-cloud orchestration:\n   - AKS cluster composition\n   - EKS cluster composition (with KCL)\n   - GKE Spotinst composition\n\n4. **Operational Helpers**:\n   - `helm_helper` - Helm operations\n   - `kubectl_helper` - Kubernetes operations\n   - `cleanup_tracker` - Resource cleanup management\n\n5. **Configuration Contracts** (Helm Values):\n   - `foxtrot_aks_values` (accountID, composition, cluster)\n   - `foxtrot_eks_values` (accountID, composition, efs, cluster)\n   - `foxtrot_gke_values` (cluster)\n\nThe module externalizes state into cluster and Helm configuration rather than holding it in-process, with the authoritative state residing in the control plane. All 14 public functions are test utilities and lifecycle fixtures that enable cloud-agnostic cluster provisioning patterns across AKS, EKS, and GKE.",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "diagrams/compute-common-contracts.mmd",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "diagrams/compute-common-contracts.mmd",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 2,
        "navigation": 1,
        "notes": "AI identified Crossplane and Helm but missed critical ground truth components: Crossplane core + provider-aws/provider-kubernetes, KCL build pipeline, and ArgoCD deployment flow. Answer elaborates on implementation details (file counts, helper functions) not in ground truth, suggesting over-exploration of module internals rather than identifying core architectural dependencies. Navigation failed to locate the authoritative dependency specification."
      },
      "composite": 35,
      "notFound": false
    },
    {
      "id": "ops-argocd-deployment-flow",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the ArgoCD deployment flow for an application, from ApplicationSet watching app-runtime to pods starting?",
      "groundTruth": "1. ApplicationSet watches app-runtime. 2. Reads argo-gen-params.yaml per environment. 3. Generates Application. 4. Pulls chart from OCI registry. 5. Merges values and overrides. 6. Renders manifests. 7. Applies to cluster. 8. ESO fetches secrets. 9. Pods start with injected configs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the ArgoCD ApplicationSet resources generated by the argocd-apps chart (such as `applicationset-addons`, `applicationset-apps`, `applicationset-state-apps`, etc.) and the overall hub-and-spoke deployment topology, but does not contain a detailed step-by-step description of the deployment flow from ApplicationSet watching app-runtime through to pods starting.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the specific deployment flow steps were not found in available documentation rather than hallucinating details. However, it failed to locate or access the correct information source containing the 9-step ArgoCD deployment process. The response demonstrates honest assessment but zero information retrieval success."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-ebf-release-pattern",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the EBF/weekly release pattern?",
      "groundTruth": "1. Create release branch from develop. 2. Promote versions for release/hotfix environments. 3. Run stage pipelines (preview, preprod, production). 4. Merge promoted changes to main. 5. Reconcile hotfix and develop branches.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation mentions that `hotfix` branches deploy to EBF verification and `release` branches deploy to release verification (weekly release staging) in the Branch to Cluster Mapping section, but it does not contain the detailed steps of the EBF/weekly release pattern itself.",
      "filesRead": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "agent-kb.json",
        "reference/subsystems/control-core.md"
      ],
      "plannedFiles": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "agent-kb.json",
        "reference/subsystems/control-core.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the specific EBF/weekly release pattern steps were not found in available documentation, avoiding hallucination. However, it failed to locate or provide the ground truth steps, resulting in zero accuracy and completeness scores. The high precision score reflects honest acknowledgment of information gaps rather than fabrication."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-rollback-procedure",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the preferred rollback procedure in Foxtrot, and when are script-level deletes used instead?",
      "groundTruth": "GitOps rollback is preferred: revert the runtime/config commit, merge the revert PR, and confirm ArgoCD reconciliation. Script-level deletes (delete-vpc.sh, delete-cluster.sh, delete-app.sh) are used only with explicit approval and change control.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about the preferred rollback procedure in Foxtrot or when script-level deletes are used instead. While the documentation references delete scripts (`delete-cluster.sh`, `delete-vpc.sh`) with timeout parameters, it does not explain rollback procedures or the decision criteria for using script-level deletes versus other approaches.",
      "filesRead": [
        "reference/operations.md",
        "reference/subsystems/compute-common.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/operations.md",
        "reference/subsystems/compute-common.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI answered NOT_FOUND when ground truth exists. The answer correctly avoided hallucination but failed to locate documented information about GitOps rollback procedures and script-level delete criteria in Foxtrot."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-branch-cluster-mapping",
      "category": "operations",
      "difficulty": "easy",
      "question": "What is the branch-to-cluster mapping in app-runtime? Which branch targets which environments?",
      "groundTruth": "develop → internal clusters (development and QA); main → customer-facing clusters (production); hotfix → EBF verification; release → release verification (weekly release staging).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit branch-to-cluster mapping information for app-runtime. While the files show the directory structure of app-runtime with various environments (e.g., `ap03-rdm-prod`, `bce100-develop`, `eus700-develop`) and clusters (e.g., `aws-prd-ap-1-eks`, `aws-dev-us-99-eks`), there is no documentation specifying which git branch targets which environment or cluster.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent failed to locate branch-to-cluster mapping documentation. Ground truth specifies four distinct mappings (develop→internal, main→production, hotfix→EBF, release→staging) that exist in the codebase but were not found. Agent correctly avoided hallucination but did not navigate to the right information source."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-jenkins-jobs",
      "category": "operations",
      "difficulty": "hard",
      "question": "What are the six Jenkins jobs in the Foxtrot pipeline and what does each do?",
      "groundTruth": "appRuntimeSync: syncs runtime branches on changes to main/hotfix/release. foxtrotSync: updates imagetags-override.yaml with new image tags. foxtrotPromote: promotes and merges release outputs. chartSync: updates argo-gen-params.yaml with chart versions. chartCheckInAppCommon: detects chart version mismatches and triggers chartSync. chartPromoteVersion: promotes chart versions per release stage.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about six Jenkins jobs in a Foxtrot pipeline. While the files reference Jenkins as a Helm chart and mention Foxtrot-related infrastructure templates (foxtrot-eks, foxtrot-aks, foxtrot-gke, etc.), there is no documentation describing specific Jenkins pipeline jobs or their purposes.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent correctly identified that the information was not found in available documentation, avoiding hallucination. However, failed to locate the source material containing the six Jenkins job definitions and their purposes. The honest 'NOT_FOUND' response demonstrates precision but indicates the agent did not navigate to the correct documentation."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-create-cluster-timeout",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the timeout for waiting for a cluster to reach ready condition in the create-cluster workflow?",
      "groundTruth": "3600 seconds (1 hour), with a poll interval of 30 seconds.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about a \"create-cluster workflow\" or its timeout for waiting for a cluster to reach ready condition.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/subsystems/compute-common.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "reference/subsystems/control-core.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/subsystems/compute-common.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "reference/subsystems/control-core.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent failed to locate the create-cluster workflow documentation containing timeout specifications. Returned NOT_FOUND instead of the ground truth values (3600 seconds timeout, 30 second poll interval). While the response avoids hallucination, it provides no useful information."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-cloud-resource-naming",
      "category": "configuration",
      "difficulty": "easy",
      "question": "What is the canonical format for cloud resource names in Foxtrot, and what does each component represent?",
      "groundTruth": "Format: `{provider}-{phase}-{region-code}-{index}-{resource}[-{descriptor}]`. provider: aws/gcp/azr; phase: prd/iqa/dev; region-code: two-letter derived code; index: incremental integer; resource: vpc/eks/gke/aks/lb/rds. Example: `gcp-prd-us-1-vpc-blue`.",
      "llmAnswer": "Based on the documentation provided, the canonical format for cloud resource names in Foxtrot follows these patterns:\n\n**Cluster Naming:**\n- AWS: `aws-{phase}-{region-code}-{index}-eks`\n- Azure: `azr-{phase}-{region-code}-{index}-aks`\n\n**VPC/Network Naming:**\n- AWS: `aws-{phase}-{region-code}-{index}-vpc`\n- Azure: `azr-{phase}-{region-code}-{index}-vnet`\n\n**Component breakdown:**\n- `{phase}` - deployment phase (e.g., dev, prod)\n- `{region-code}` - cloud region code (e.g., us-east-1, centralus)\n- `{index}` - numeric index for multiple resources in the same phase/region\n- Suffix - cloud provider and resource type (eks/aks for clusters, vpc/vnet for networks)\n\nThese naming conventions are enforced via scripts in `compute-tools/bash-spoke-setup/scripts/` (create-cluster.sh and create-vpc.sh).",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 2,
        "precision": 3,
        "navigation": 2,
        "notes": "AI correctly identifies the format structure and component order, but significantly undercovers the specification. Missing: GCP provider and gke/gcp resources, iqa phase, lb/rds resource types, optional descriptor component, and the canonical example. Region-code description imprecise (states 'us-east-1' when ground truth specifies two-letter codes like 'us'). Includes potentially hallucinated details (vnet suffix, specific script paths) not in ground truth. Found partial documentation but missed the authoritative canonical definition."
      },
      "composite": 50,
      "notFound": false
    },
    {
      "id": "config-region-code-algorithm",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the algorithm for deriving a two-letter region code from a cloud region name, and what is the region code for `westeurope`?",
      "groundTruth": "Algorithm: 1. Remove directional words (east, west, north, south). 2. Remove separators. 3. Take first two characters. For `westeurope`: remove 'west' → 'europe' → 'eu'.",
      "llmAnswer": "**Algorithm for deriving a two-letter region code:**\n1. Remove directional words\n2. Remove separators\n3. Take first two characters\n\n**Region code for `westeurope`:** `eu`\n\n(The documentation provides this example directly: \"westeurope -> eu\")",
      "filesRead": [
        "agent-kb.json",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/subsystems/network-common.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "agent-kb.json",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/subsystems/network-common.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "Answer correctly states all three algorithm steps and provides the accurate region code for westeurope. No hallucinations or extraneous information. Agent demonstrated successful information retrieval by citing the documentation source."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "config-app-config-merge-order",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the configuration merge order for app charts in Foxtrot, from lowest to highest precedence?",
      "groundTruth": "1. values.yaml, 2. default-properties.yaml, 3. default-values.yaml / k8s-values.yaml, 4. common-values.yaml, 5. properties-override.yaml, 6. k8s-override.yaml, 7. imagetags-override.yaml.",
      "llmAnswer": "From lowest to highest precedence:\n\n1. values.yaml\n2. default-properties.yaml\n3. default-values.yaml / k8s-values.yaml\n4. common-values.yaml\n5. properties-override.yaml\n6. k8s-override.yaml\n7. imagetags-override.yaml",
      "filesRead": [
        "reference/configuration.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/configuration.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI answer matches ground truth exactly in both content and order. All seven configuration files listed in correct precedence sequence with no extraneous information. Navigation scored 4 rather than 5 due to lack of visible evidence of the retrieval process, though perfect accuracy indicates successful information location."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "config-account-creation-product-id",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What is the AWS Service Catalog product ID used by account-common for account creation, and what are the customer-facing and internal OU IDs?",
      "groundTruth": "Service Catalog product ID: `prod-mts6togilnnuk`. Customer-facing OU: `ou-8vpz-mogk4qua`. Internal OU: `ou-8vpz-rw7etshv`. Provider config reference: `aws-709659200419`.",
      "llmAnswer": "Based on the provided documentation files:\n\n**AWS Service Catalog product ID for account creation:** `prod-mts6togilnnuk`\n\n**Customer-facing OU ID:** NOT_FOUND\n\n**Internal OU ID:** NOT_FOUND\n\nThe configuration reference provides the Service Catalog product ID, but the customer-facing and internal OU IDs are not documented in the files provided.",
      "filesRead": [
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 3,
        "navigation": 2,
        "notes": "AI correctly identified the Service Catalog product ID but failed to locate both OU IDs that exist in ground truth. The claim that OU information is 'not documented' contradicts the ground truth. Agent demonstrated partial navigation capability but missed critical configuration details."
      },
      "composite": 35,
      "notFound": true
    },
    {
      "id": "config-ipam-rds-backup",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What are the RDS automated backup retention period and backup window configured in ipam-core?",
      "groundTruth": "RDS automated backup retention: 7 days. RDS backup window: 03:00-06:00 UTC.",
      "llmAnswer": "NOT_FOUND\n\nThe documentation provided lists the files in ipam-core (including `backup.tf` and `rds.tf`) but does not include the actual content of these Terraform configuration files where the RDS automated backup retention period and backup window would be specified.",
      "filesRead": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent correctly identified information was unavailable in provided documentation but failed to locate or retrieve the actual RDS configuration values (7-day retention, 03:00-06:00 UTC backup window). While 'NOT_FOUND' is honest and precise, it represents complete failure to answer the question with ground truth facts."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-dev-artifact-naming",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the artifact naming convention for development chart versions vs release chart versions in the OCI registry?",
      "groundTruth": "Development artifacts: `{chart-name}:0.0.0-{commit-sha}`. Release artifacts: `{chart-name}:{chart-version}` (semantic version from Chart.yaml).",
      "llmAnswer": "Development: `{chart-name}:0.0.0-{commit-sha}`\n\nRelease: `{chart-name}:{chart-version}`",
      "filesRead": [
        "reference/helm/index.md",
        "agent-kb.json",
        "reference/system-architecture.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "agent-kb.json",
        "reference/system-architecture.md",
        "reference/helm/charts/index.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 4,
        "precision": 5,
        "navigation": 4,
        "notes": "AI answer correctly captures both naming conventions with exact format strings. Minor completeness gap: doesn't explicitly state that release version derives from Chart.yaml semantic versioning, though this is implied by the placeholder. Answer is factually accurate and free of hallucinations. Navigation score reflects correct answer quality without visibility into retrieval process."
      },
      "composite": 90,
      "notFound": false
    },
    {
      "id": "services-tech-stack-orchestration",
      "category": "services",
      "difficulty": "easy",
      "question": "What technologies make up the orchestration layer in Foxtrot's technology stack?",
      "groundTruth": "Kubernetes (EKS, GKE, AKS), ArgoCD, and Crossplane.",
      "llmAnswer": "Based on the documentation provided, the orchestration layer in Foxtrot's technology stack consists of:\n\n1. **ArgoCD** - GitOps-based deployment orchestration using ApplicationSets for hub-and-spoke topology management\n2. **Argo Workflows** - Workflow orchestration engine\n3. **Argo Events** - Event-driven automation framework\n4. **Crossplane** - Infrastructure orchestration and provisioning platform with multiple cloud providers (AWS, GCP, Azure) and specialized providers (Spot, Terraform, Helm)\n\nThese technologies are centralized in the `control-core` subsystem, which contains dedicated modules and Helm configurations for each orchestration component. The ArgoCD ApplicationSets define ownership models for managing infrastructure across hub (control plane) and spoke (application runtime) clusters.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/subsystems/control-core.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/subsystems/control-core.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 1,
        "navigation": 2,
        "notes": "AI answer correctly identified ArgoCD and Crossplane but critically omitted Kubernetes (EKS, GKE, AKS)—the foundational orchestration platform. Additionally, the answer hallucinated Argo Workflows and Argo Events as orchestration layer components, which are not in the ground truth. The omission of Kubernetes represents a fundamental gap in understanding the orchestration stack, while the inclusion of unverified Argo components suggests either poor source navigation or domain confusion."
      },
      "composite": 35,
      "notFound": false
    },
    {
      "id": "services-state-management",
      "category": "services",
      "difficulty": "easy",
      "question": "What are the four state management services used in Foxtrot?",
      "groundTruth": "Elasticsearch, Hazelcast, Redis, and Milvus.",
      "llmAnswer": "Based on the documentation provided, the **State Management Services** listed are:\n\n1. elasticsearch\n2. hazelcast\n3. cassandra\n4. milvus\n5. redis\n\nHowever, the documentation lists **five** state management services, not four. The source states: \"**State Management Services:** elasticsearch, hazelcast, cassandra, milvus, redis\" (from reference/system-architecture.md).",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 4,
        "precision": 2,
        "navigation": 5,
        "notes": "AI correctly identified all four ground truth services (elasticsearch, hazelcast, redis, milvus) but included cassandra as a fifth service, which is not in the expected answer. The AI noted the discrepancy between the question asking for four services and the documentation listing five, showing awareness of the conflict. However, this resulted in an answer that exceeds the ground truth scope. Navigation was excellent—the agent located the correct source file and section immediately."
      },
      "composite": 70,
      "notFound": false
    },
    {
      "id": "services-eks-addon-versions",
      "category": "services",
      "difficulty": "hard",
      "question": "What are the default pinned versions for the vpc-cni and coredns EKS addons in compute-common?",
      "groundTruth": "vpc-cni: v1.20.1-eksbuild.1. coredns: v1.12.2-eksbuild.4.",
      "llmAnswer": "Based on the system-architecture.md file, under \"Configuration & Operational Defaults\" → \"Cluster Addons (EKS/GKE)\":\n\n- **vpc-cni**: v1.20.1-eksbuild.1\n- **coredns**: v1.12.2-eksbuild.4",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/compute-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/compute-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both addon versions. Correctly cited source document and section. No hallucinations or extraneous information. Demonstrated direct navigation to the correct file and configuration section."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "services-aws-nat-egress-model",
      "category": "services",
      "difficulty": "medium",
      "question": "What is the AWS NAT egress model for internal VPCs vs production clusters in Foxtrot?",
      "groundTruth": "Internal AWS VPCs use shared NAT egress. Production AWS clusters use dedicated NAT egress per cluster. This split reduces cost for internal environments while isolating production egress and capacity per cluster.",
      "llmAnswer": "Based on the Network Architecture Reference documentation:\n\n**AWS NAT Egress Model:**\n- **Internal AWS VPCs:** Use shared NAT egress\n- **Production AWS clusters:** Use dedicated NAT egress per cluster",
      "filesRead": [
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/network-architecture.md",
        "reference/system-architecture.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/network-core.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/network-architecture.md",
        "reference/system-architecture.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/network-core.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 3,
        "precision": 5,
        "navigation": 4,
        "notes": "AI correctly identified both NAT egress models (shared for internal, dedicated per cluster for production). However, it omitted the rationale explaining why this split exists (cost optimization for internal, isolation and capacity management for production). The answer is factually accurate but incomplete without the business/technical reasoning. Navigation appears sound based on source citation, though verification is limited."
      },
      "composite": 85,
      "notFound": false
    },
    {
      "id": "services-ipam-netbox-role",
      "category": "services",
      "difficulty": "medium",
      "question": "What is NetBox used for in Foxtrot, and what infrastructure does it run on?",
      "groundTruth": "NetBox is Foxtrot's IP address management (IPAM) source of truth. It tracks all VPC CIDR blocks across clouds, prevents CIDR overlap, and serves allocation data to compute-tools during VPC and cluster provisioning. It runs on AWS ECS with an RDS database and ALB, provisioned via Terraform in ipam-core.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation shows that NetBox infrastructure files exist within the `ipam-core` subsystem (with Terraform configurations for AWS resources like ALB, ECS, RDS, VPC, etc.), but it does not explicitly state what NetBox is used for in Foxtrot or provide details about its functional purpose within the system.",
      "filesRead": [
        "reference/subsystems/ipam-core.md",
        "reference/system-architecture.md",
        "reference/network-architecture.md",
        "reference/contracts/index.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/ipam-core.md",
        "reference/system-architecture.md",
        "reference/network-architecture.md",
        "reference/contracts/index.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND, providing no facts about NetBox's purpose or infrastructure. While the response avoids hallucination (precision=5), it fails to locate or extract the ground truth information about NetBox's IPAM role, CIDR management functions, or AWS ECS/RDS/ALB infrastructure. The agent did not successfully navigate to the relevant documentation."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-argo-gen-params-required",
      "category": "contracts",
      "difficulty": "easy",
      "question": "What are the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator?",
      "groundTruth": "chart.version (string), namespace (string), and environment (string).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator. While the files reference ArgoCD ApplicationSets and the argocd-apps chart, they do not specify the structure or required fields of this configuration file.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND instead of providing the required fields (chart.version, namespace, environment). While the response avoided hallucination, it failed to locate or provide the ground truth information."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-azure-xrd-naming",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What is the XRD name format and composition name format for Azure AKS in Foxtrot, with examples?",
      "groundTruth": "XRD name format: `{plural}.{group}`, e.g. `reltioaksclusters.foxtrot.reltio.com`. Composition name format: `reltio-aks-cluster-{channel}-{version}`, e.g. `reltio-aks-cluster-prd-v1`.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation references Azure AKS resources in `compute-common` (specifically `compute-common/compositions/reltio-aks-cluster-kcl/` and `compute-common/templates/foxtrot-aks/`), but does not include the actual content of those files that would specify the XRD name format and composition name format. The system architecture section shows a cluster naming convention (`azr-{phase}-{region-code}-{index}-aks`) but not the XRD or composition name formats.",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/helm/charts/network-common-charts-foxtrot-azure-vnet.md",
        "reference/contracts/index.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/helm/charts/network-common-charts-foxtrot-azure-vnet.md",
        "reference/contracts/index.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND without locating the specific XRD and composition name formats. While the response avoided hallucination (precision=5), it failed to retrieve the ground truth information: XRD format `{plural}.{group}` (example: `reltioaksclusters.foxtrot.reltio.com`) and composition format `reltio-aks-cluster-{channel}-{version}` (example: `reltio-aks-cluster-prd-v1`). The agent identified relevant file paths but did not access their contents to extract the required specifications."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-helm-chart-required-values",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What are the five required values that all app Helm charts must define?",
      "groundTruth": "accountID (string), cloudProvider (string), region (string), domain (string), and imageRegistry (string).",
      "llmAnswer": "NOT_FOUND\n\nThe documentation provided does not explicitly state which five values are required across all app Helm charts. While the contracts section shows various values defined for individual app charts (like `ai_app_values`, `auth_app_values`, `cleanse_app_values`, etc.), there is no explicit specification of five universally required values that all app charts must define.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 0,
        "navigation": 0,
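        "notes": "Parse error: the scorer's response could not be parsed and no error detail was captured. The zero scores and composite for this entry appear to be fallback defaults rather than judged values; re-score before relying on them."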
      },
      "composite": 0,
      "notFound": true
    },
    {
      "id": "contracts-sync-wave-ordering",
      "category": "contracts",
      "difficulty": "hard",
      "question": "What are the ArgoCD sync wave values and what resource types are deployed at each wave?",
      "groundTruth": "Wave -5: prerequisites. Wave 0: infrastructure. Wave 5: state apps. Wave 10: application services. Wave 15: post-deployment resources (monitoring, backup hooks, finalizers).",
      "llmAnswer": "Based on the \"Helm Sync Waves (Bootstrapping Order)\" table in reference/helm/index.md:\n\n| Wave | Count | Resources |\n|---|---|---|\n| -1 | 1 | ServiceAccount:pre-sync-app-templates-sa |\n| 0 | 3 | Application:{{, xReltioAKSCluster:{{, ReltioEKSCluster:{{ |\n| 1 | 5 | ClusterRole:pre-sync-app-templates-clusterrole, Application:{{, Workflow:kong-gateway-setup-{{, AccessEntry:{{, Addon:{{ |\n| 2 | 4 | ClusterRoleBinding:pre-sync-app-templates-clusterrolebinding, Workflow:{{, ProviderConfig:{{, ProviderConfig:{{ |\n| 3 | 4 | Workflow:{{, Application:kong-gateway, Release:{{, Release:{{ |\n| 4 | 1 | Release:{{ |\n| 5 | 1 | Job:pre-sync-app-templates-check-job |\n\nThe sync waves establish a bootstrapping order where Wave -1 executes first (ServiceAccount creation), followed by Waves 0-5 in sequence, with Wave 5 being the final validation job.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 1,
        "navigation": 1,
        "notes": "AI answer describes waves -1 through 5 with specific Kubernetes resources, but ground truth specifies waves -5, 0, 5, 10, 15 with logical groupings (prerequisites, infrastructure, state apps, application services, post-deployment). Wave numbers are entirely misaligned. Missing waves 10 and 15 entirely. The detailed resource table appears to reference different documentation or system than ground truth, suggesting either wrong source consulted or hallucinated data. No overlap in wave structure or resource categorization."
      },
      "composite": 20,
      "notFound": false
    }
  ]
}