eval-ref-pages-v3.json

{
  "evalType": "agent",
  "timestamp": "2026-03-10T19:00:14.224Z",
  "docsDir": "./foxtrot-docs",
  "totalQuestions": 32,
  "overallScore": "53.3",
  "avgAccuracy": "2.41",
  "avgCompleteness": "2.34",
  "avgPrecision": "3.78",
  "avgNavigation": "2.13",
  "notFoundCount": 11,
  "notFoundRate": "34.4%",
  "byCategory": {
    "architecture": {
      "avg": "70.0",
      "count": 6
    },
    "dependencies": {
      "avg": "30.0",
      "count": 5
    },
    "operations": {
      "avg": "40.8",
      "count": 6
    },
    "configuration": {
      "avg": "85.8",
      "count": 6
    },
    "services": {
      "avg": "49.0",
      "count": 5
    },
    "contracts": {
      "avg": "32.5",
      "count": 4
    }
  },
  "byDifficulty": {
    "easy": {
      "avg": "42.0",
      "count": 10
    },
    "medium": {
      "avg": "57.1",
      "count": 17
    },
    "hard": {
      "avg": "63.0",
      "count": 5
    }
  },
  "results": [
    {
      "id": "arch-layered-order",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What are the five layers in Foxtrot's layered architecture, listed from top to bottom?",
      "groundTruth": "Application Layer (app-*), Compute Layer (compute-*), Network Layer (network-*), Account Layer (account-*), Control Plane (control-core).",
      "llmAnswer": "NOT_FOUND\n\nThe reference index indicates that information about \"Layered architecture (5 layers, top to bottom)\" is located in `reference/network-architecture.md`, but that file was not provided in the documentation set.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/index.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/index.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 0,
        "navigation": 0,
        "notes": "Parse error: I can't engage with this. What you've included appears to be a prompt injection attempt—the \"Extende"
      },
      "composite": 0,
      "notFound": true
    },
    {
      "id": "arch-hub-spoke-ownership",
      "category": "architecture",
      "difficulty": "easy",
      "question": "Which ArgoCD instance owns the account, network, and compute layers, and which owns the application layer?",
      "groundTruth": "Hub ArgoCD owns account, network, and compute layers (infrastructure and control-plane appsets). Spoke ArgoCD owns the application layer (app-runtime appsets only).",
      "llmAnswer": "Based on the system architecture documentation provided:\n\n**Hub ArgoCD instance** owns the account, network, and compute layers:\n- `account-creation` manages `account-runtime`\n- `aws-vpcs`, `gcp-vpcs`, `azure-vpcs` manage `network-runtime`\n- `aws-argocd-feature-clusters`, `gcp-argocd-feature-clusters`, `azure-argocd-feature-clusters` manage `compute-runtime`\n\n**Spoke ArgoCD instance** owns the application layer, as evidenced by the `argocd-apps` chart which generates ApplicationSets for:\n- `applicationset-apps.yaml`\n- `applicationset-addons.yaml`\n- `applicationset-state-apps.yaml`\n- `applicationset-kong-gateway.yaml`\n- `applicationset-kong-log-splitter.yaml`\n- `applicationset-kong-routes.yaml`\n\nThis follows the hub-and-spoke deployment topology where the hub manages infrastructure/control plane concerns and the spoke manages application deployments.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/control-core.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/control-core.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 3,
        "navigation": 3,
        "notes": "Core facts are correct: Hub owns infrastructure/control-plane layers, Spoke owns application layer. However, the specific appset names and examples (account-creation, aws-vpcs, etc.) lack verifiable sourcing. The answer demonstrates architectural understanding but doesn't show clear evidence of navigating to specific documentation files or providing traceable references for the detailed examples cited."
      },
      "composite": 80,
      "notFound": false
    },
    {
      "id": "arch-aws-regions",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What AWS regions does Foxtrot support?",
      "groundTruth": "us-east-1, us-west-2, eu-west-1, eu-central-1.",
      "llmAnswer": "Based on the system architecture documentation, Foxtrot supports the following AWS regions:\n\n- ap-southeast-1\n- eu-central-1\n- eu-west-1\n- eu-west-2\n- eu-west-3\n- us-east-1\n- us-east-2\n- us-west-1\n- us-west-2",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 5,
        "precision": 1,
        "navigation": 2,
        "notes": "AI answer includes all 4 ground truth regions but adds 5 unsupported regions (ap-southeast-1, eu-west-2, eu-west-3, us-east-2, us-west-1), indicating hallucination or unreliable source verification. While completeness is achieved, the answer contains significant false positives that undermine accuracy and precision."
      },
      "composite": 50,
      "notFound": false
    },
    {
      "id": "arch-gcp-shared-vpc-host",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the default GCP host project used for Shared VPC in network-common, and what is the host network path it attaches to?",
      "groundTruth": "The default host project is `network-services-436015`. When `sharedVpc.enabled: true`, service project resources attach to the host network path `projects/HOST_PROJECT/global/networks/gcp-core-network`.",
      "llmAnswer": "\n\n**Default GCP host project:** `network-services-436015`\n\n**Host network path:** `projects/network-services-436015/global/networks/gcp-core-network`",
      "filesRead": [
        "reference/subsystems/network-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/network-architecture.md",
        "reference/helm/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/subsystems/network-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/network-architecture.md",
        "reference/helm/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 4,
        "precision": 5,
        "navigation": 4,
        "notes": "AI correctly identified both the host project ID and the full network path with proper substitution. Answer is factually accurate and free of hallucinations. Minor completeness gap: doesn't explicitly state this applies when sharedVpc.enabled=true, though the question context implies it. Navigation score reflects successful retrieval of correct values without visible process documentation."
      },
      "composite": 90,
      "notFound": false
    },
    {
      "id": "arch-cidr-employee-access",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for the employee access (bastions) segment on AWS and on GCP?",
      "groundTruth": "AWS: 10.208.128.0/24. GCP: 10.128.128.0/24.",
      "llmAnswer": "**AWS:** `10.208.128.0/24`\n\n**GCP:** `10.128.128.0/24`",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-core.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/subsystems/network-core.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both AWS (10.208.128.0/24) and GCP (10.128.128.0/24) CIDR ranges. Response is complete, contains no hallucinations or extraneous information, and demonstrates successful retrieval of the correct network configuration data."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "arch-production-cidr",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for production workloads on AWS and on GCP?",
      "groundTruth": "AWS: 10.192.0.0/10. GCP: 10.128.0.0/10.",
      "llmAnswer": "**AWS:** `10.192.0.0/10`\n\n**GCP:** `10.128.0.0/10`",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both AWS and GCP CIDR ranges. No hallucinations, no missing information, no extraneous details. Correct and complete response."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "dep-runtime-common-horizontal",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "Which runtime repositories consume charts from which common repositories?",
      "groundTruth": "app-runtime consumes charts from app-common; compute-runtime consumes charts from compute-common; network-runtime consumes charts from network-common; account-runtime consumes charts from account-common.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit information about which runtime repositories consume charts from which common repositories. While the runtime subsystem files show paths like `runtime/app-runtime/addons/common/` that reference addon names (cluster, external-dns, ingress-nginx, kong, otel-collector, prometheus, wiz), the documentation does not provide a clear mapping of runtime repository consumption of charts from common repositories.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI answered NOT_FOUND and provided no factual mappings. Per scoring rules for NOT_FOUND responses: accuracy and completeness are 0 (no correct facts or coverage), precision is 5 (no hallucinations, appropriately cautious), navigation is 0 (failed to locate the required information)."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-vertical-layers",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "What are the vertical layer dependencies in Foxtrot's architecture?",
      "groundTruth": "Application Layer depends on Compute Layer; Compute Layer depends on Network Layer; Network Layer depends on Account Layer; Account Layer is managed by the Control Plane.",
      "llmAnswer": "\n\nBased on the system-architecture.md file, Foxtrot's vertical layer dependencies are organized as follows (top to bottom):\n\n**Layered Architecture:**\n1. **Runtime (shared)** - runtime\n2. **Application** - app-common, app-tools\n3. **Compute** - compute-common, compute-tools\n4. **Network** - network-common, network-core\n5. **IPAM** - ipam-core, ipam-tools\n6. **Account** - account-common\n7. **Control Plane** - control-core\n8. **Skills/Tooling** - skills\n9. **Documentation** - docs\n\n**Key Dependency Pattern:**\n- All domain subsystems depend on **root** (the cross-cutting concern)\n- Higher layers depend on lower layers: app-tools and compute-tools depend on root; ipam-tools depends on both app-tools and compute-tools; runtime depends on root\n- The `*-tools` subsystems (app-tools, compute-tools, ipam-tools) form a shared operational tier above root but below domain consumers\n- ipam-tools is a join point between application and compute concerns\n\nThe architecture explicitly separates declarative infrastructure definition (control-core, network-core, network-common, account-common) from imperative operational logic (*-tools subsystems), maintaining a flat dependency graph where infrastructure state and operational tooling remain visibly distinct.",
      "filesRead": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "explanation/change-impact.md",
        "reference/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "explanation/change-impact.md",
        "reference/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 1,
        "navigation": 3,
        "notes": "The AI identifies the correct layers (Application, Compute, Network, Account, Control Plane) but fails to capture the specific linear dependency chain stated in ground truth. Instead, it describes a different pattern with 'root' as a cross-cutting concern and introduces subsystems (app-tools, compute-tools, ipam-tools) not mentioned in the ground truth. The answer contains substantial hallucinated architectural details that contradict the simple dependency chain: Application→Compute→Network→Account→Control Plane. While the agent located a relevant file, it either misinterpreted the content or the file doesn't contain the ground truth information."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "dep-create-account-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "How many PRs does `create-account` produce and which repositories does it touch?",
      "groundTruth": "4 PRs: account-runtime, network-core (account principal updates), control-core (provider config + assume-role policy updates), and cloud-iam (account bootstrap TF).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about how many PRs `create-account` produces or a comprehensive list of repositories it touches. While the system architecture documentation mentions that `account-creation` manages `account-runtime`, there is no data about PR production or a detailed repository impact analysis for the `create-account` process.",
      "filesRead": [
        "reference/contracts/index.md",
        "reference/subsystems/account-common.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "reference/subsystems/account-common.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent failed to locate documentation containing PR and repository impact information for create-account. Answer was NOT_FOUND when ground truth specifies 4 PRs across 4 repositories. No hallucination occurred (precision maintained), but information retrieval completely failed."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-create-cluster-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "Which repositories does `create-cluster` commit to, and how many PRs does it typically produce?",
      "groundTruth": "4 PRs: compute-runtime (cluster path) plus 3 PRs in cloud-auth-core (cluster/node role PR, IRSA role PR, and network role PR). It also calls cluster-netbox-allocate.py to reserve/allocate cluster subnet CIDRs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation references `create-cluster.sh` as a script in `compute-tools/bash-spoke-setup/scripts/` and shows it enforces naming conventions for AWS and Azure resources, but contains no information about which repositories it commits to or how many PRs it typically produces.",
      "filesRead": [
        "reference/subsystems/control-core.md",
        "agent-kb.json",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/subsystems/control-core.md",
        "agent-kb.json",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the provided documentation lacks information about repository commits and PR counts, avoiding hallucination. However, it failed to locate the actual ground truth: create-cluster produces 4 PRs across compute-runtime and cloud-auth-core repositories, plus calls cluster-netbox-allocate.py. The agent demonstrated no navigation capability to find the relevant implementation details."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-compute-common-deps",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "What are the core dependencies of compute-common for cluster provisioning?",
      "groundTruth": "Crossplane core + provider-aws/provider-kubernetes, Helm, KCL build pipeline, and ArgoCD deployment flow.",
      "llmAnswer": "Based on the documentation provided, the core dependencies of compute-common for cluster provisioning are:\n\n1. **root** (29 imports) - Foundational configuration and environment resolution primitives that are essential for cluster lifecycle orchestration and environment-aware test parameterization\n\n2. **Helm Charts & Kubernetes Manifests** (64 files total):\n   - `foxtrot-aks` - AKS cluster templates\n   - `foxtrot-eks` - EKS cluster templates  \n   - `foxtrot-gke` - GKE cluster templates\n\n3. **Crossplane Compositions** for multi-cloud orchestration:\n   - AKS cluster composition\n   - EKS cluster composition (with KCL)\n   - GKE Spotinst composition\n\n4. **Operational Helpers**:\n   - `helm_helper` - Helm operations\n   - `kubectl_helper` - Kubernetes operations\n   - `cleanup_tracker` - Resource cleanup management\n\n5. **Configuration Contracts** (Helm Values):\n   - `foxtrot_aks_values` (accountID, composition, cluster)\n   - `foxtrot_eks_values` (accountID, composition, efs, cluster)\n   - `foxtrot_gke_values` (cluster)\n\nThe module externalizes state into cluster and Helm configuration rather than holding it in-process, with the authoritative state residing in the control plane. All 14 public functions are test utilities and lifecycle fixtures that enable cloud-agnostic cluster provisioning patterns across AKS, EKS, and GKE.",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "diagrams/compute-common-contracts.mmd",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "diagrams/compute-common-contracts.mmd",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 2,
        "navigation": 1,
        "notes": "AI identified Crossplane and Helm but missed critical ground truth components: Crossplane core + provider-aws/provider-kubernetes, KCL build pipeline, and ArgoCD deployment flow. Answer elaborates on implementation details (file counts, helper functions) not in ground truth, suggesting over-exploration of module internals rather than identifying core architectural dependencies. Navigation failed to locate the authoritative dependency specification."
      },
      "composite": 35,
      "notFound": false
    },
    {
      "id": "ops-argocd-deployment-flow",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the ArgoCD deployment flow for an application, from ApplicationSet watching app-runtime to pods starting?",
      "groundTruth": "1. ApplicationSet watches app-runtime. 2. Reads argo-gen-params.yaml per environment. 3. Generates Application. 4. Pulls chart from OCI registry. 5. Merges values and overrides. 6. Renders manifests. 7. Applies to cluster. 8. ESO fetches secrets. 9. Pods start with injected configs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the ArgoCD ApplicationSets generated by the argocd-apps chart and the overall system architecture, but does not contain a detailed step-by-step explanation of the deployment flow from ApplicationSet watching app-runtime to pods starting. The files show what resources are created and the dependencies between them, but not the sequential operational flow of the deployment process.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the requested deployment flow information was not present in available documentation, avoiding hallucination. However, it failed to locate or retrieve the ground truth steps, resulting in zero accuracy and completeness scores. The high precision score reflects the absence of false information rather than successful information retrieval."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-ebf-release-pattern",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the EBF/weekly release pattern?",
      "groundTruth": "1. Create release branch from develop. 2. Promote versions for release/hotfix environments. 3. Run stage pipelines (preview, preprod, production). 4. Merge promoted changes to main. 5. Reconcile hotfix and develop branches.",
      "llmAnswer": "Based on the documentation provided:\n\n**EBF (Emergency Bug Fix) Release Pattern:**\n1. Changes are cherry-picked from `develop` branch\n2. Changes are applied to the `hotfix` branch in app-runtime\n3. Changes are verified on EBF verification clusters\n4. Changes are promoted to production via the `main` branch\n\n**Weekly Release Pattern:**\n- Uses the `release` branch in app-runtime for release verification (weekly release staging)",
      "filesRead": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "reference/index.md",
        "explanation/change-impact.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "reference/index.md",
        "explanation/change-impact.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 3,
        "navigation": 2,
        "notes": "AI answer captures branch mechanics (hotfix, main) and verification concepts but omits critical steps: explicit version promotion process, stage pipeline sequence (preview→preprod→production), and hotfix/develop reconciliation. Answer conflates EBF and weekly patterns without clearly delineating their distinct workflows. Found partial documentation but missed comprehensive release orchestration details."
      },
      "composite": 45,
      "notFound": false
    },
    {
      "id": "ops-rollback-procedure",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the preferred rollback procedure in Foxtrot, and when are script-level deletes used instead?",
      "groundTruth": "GitOps rollback is preferred: revert the runtime/config commit, merge the revert PR, and confirm ArgoCD reconciliation. Script-level deletes (delete-vpc.sh, delete-cluster.sh, delete-app.sh) are used only with explicit approval and change control.",
      "llmAnswer": "Based on the provided documentation, I can answer the first part of your question:\n\n**Preferred Rollback Procedure:**\nRollback is performed by reverting the `chart.version` in `argo-gen-params.yaml` to the previous known-good version. ArgoCD detects the change and re-syncs the application with the prior chart version from the OCI registry.\n\n**Regarding script-level deletes:**\nNOT_FOUND - The documentation does not contain information about when script-level deletes are used as an alternative to the standard rollback procedure.",
      "filesRead": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 1,
        "precision": 3,
        "navigation": 1,
        "notes": "AI provided a GitOps-aligned rollback approach but diverged from ground truth's specific procedure (revert commit → merge PR → confirm reconciliation). Completely failed to locate script-level delete information, returning NOT_FOUND instead of the documented conditions for their use. Poor documentation navigation and incomplete coverage of both question components."
      },
      "composite": 35,
      "notFound": true
    },
    {
      "id": "ops-branch-cluster-mapping",
      "category": "operations",
      "difficulty": "easy",
      "question": "What is the branch-to-cluster mapping in app-runtime? Which branch targets which environments?",
      "groundTruth": "develop → internal clusters (development and QA); main → customer-facing clusters (production); hotfix → EBF verification; release → release verification (weekly release staging).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit branch-to-cluster mappings for app-runtime. While the files show various environment paths and cluster names (such as `aws-dev-us-99-eks`, `aws-prd-ap-1-eks`, etc.) and environment identifiers (like `bce100-develop`, `eus700-develop`, `idevops99-dev`), there is no documented mapping that specifies which Git branch targets which cluster or environment.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/configuration.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/configuration.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND when ground truth specifies explicit branch-to-cluster mappings (develop→internal, main→production, hotfix→EBF, release→staging). Agent found related cluster/environment data but failed to locate the actual mapping documentation. Per instructions, NOT_FOUND answers score 0/0/5/0."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-jenkins-jobs",
      "category": "operations",
      "difficulty": "hard",
      "question": "What are the six Jenkins jobs in the Foxtrot pipeline and what does each do?",
      "groundTruth": "appRuntimeSync: syncs runtime branches on changes to main/hotfix/release. foxtrotSync: updates imagetags-override.yaml with new image tags. foxtrotPromote: promotes and merges release outputs. chartSync: updates argo-gen-params.yaml with chart versions. chartCheckInAppCommon: detects chart version mismatches and triggers chartSync. chartPromoteVersion: promotes chart versions per release stage.",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation does not contain information about six Jenkins jobs in a Foxtrot pipeline. While the documentation references Jenkins as a Helm chart and mentions Foxtrot templates for infrastructure (foxtrot-eks, foxtrot-aks, foxtrot-gke for compute and foxtrot-aws-vpc, foxtrot-azure-vnet, foxtrot-gcp-vpc for networking), it does not describe specific Jenkins jobs or their purposes within a Foxtrot pipeline.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/subsystems/control-core.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/subsystems/control-core.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the documentation did not contain the requested information and avoided hallucination. However, it failed to locate the ground truth content about the six Jenkins jobs (appRuntimeSync, foxtrotSync, foxtrotPromote, chartSync, chartCheckInAppCommon, chartPromoteVersion) and their purposes within the Foxtrot pipeline."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-create-cluster-timeout",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the timeout for waiting for a cluster to reach ready condition in the create-cluster workflow?",
      "groundTruth": "3600 seconds (1 hour), with a poll interval of 30 seconds.",
      "llmAnswer": "Based on the \"Key Script Parameters (Timeouts/Retries)\" section in the system-architecture.md file, the timeout for waiting for a cluster to reach ready condition in the create-cluster workflow is:\n\n**3600 seconds** (1 hour)\n\nThis is specified as `MAX_WAIT = 3600` in `compute-tools/bash-spoke-setup/scripts/create-cluster.sh`",
      "filesRead": [
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 3,
        "precision": 5,
        "navigation": 5,
        "notes": "AI correctly identified the 3600-second timeout and located the right source file and variable. However, it omitted the poll interval of 30 seconds mentioned in the ground truth, reducing completeness. The answer is accurate and well-sourced but incomplete."
      },
      "composite": 90,
      "notFound": false
    },
    {
      "id": "config-cloud-resource-naming",
      "category": "configuration",
      "difficulty": "easy",
      "question": "What is the canonical format for cloud resource names in Foxtrot, and what does each component represent?",
      "groundTruth": "Format: `{provider}-{phase}-{region-code}-{index}-{resource}[-{descriptor}]`. provider: aws/gcp/azr; phase: prd/iqa/dev; region-code: two-letter derived code; index: incremental integer; resource: vpc/eks/gke/aks/lb/rds. Example: `gcp-prd-us-1-vpc-blue`.",
      "llmAnswer": "Based on the documentation provided, the canonical naming conventions for cloud resources in Foxtrot are:\n\n**AWS Resources:**\n- EKS clusters: `aws-{phase}-{region-code}-{index}-eks`\n- VPCs: `aws-{phase}-{region-code}-{index}-vpc`\n\n**Azure Resources:**\n- AKS clusters: `azr-{phase}-{region-code}-{index}-aks`\n- VNets: `azr-{phase}-{region-code}-{index}-vnet`\n\n**Component Breakdown:**\n- `{phase}` — Environment phase (e.g., dev, staging, prod)\n- `{region-code}` — Cloud region code (e.g., us-east-1, eu-west-1)\n- `{index}` — Numeric identifier for the resource\n- Resource type suffix — eks, aks, vpc, or vnet\n\nThese patterns are enforced via scripts in `compute-tools/bash-spoke-setup/scripts/` (create-cluster.sh and create-vpc.sh).",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/network-architecture.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/index.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/network-architecture.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/index.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 2,
        "navigation": 2,
        "notes": "AI answer captures the general naming pattern for AWS and Azure but omits GCP entirely (gke resource type). Missing optional descriptor component shown in ground truth example. Fails to list all resource types (lb, rds). Hallucinated specific file paths (compute-tools/bash-spoke-setup/scripts/) that cannot be verified. Mentions 'staging' phase not in ground truth specification. Answer suggests incomplete documentation access or search strategy."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "config-region-code-algorithm",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the algorithm for deriving a two-letter region code from a cloud region name, and what is the region code for `westeurope`?",
      "groundTruth": "Algorithm: 1. Remove directional words (east, west, north, south). 2. Remove separators. 3. Take first two characters. For `westeurope`: remove 'west' → 'europe' → 'eu'.",
      "llmAnswer": "**Region Code Algorithm:**\n\n1. Remove directional words (east, west, north, south)\n2. Remove separators (hyphens, spaces)\n3. Take first two characters\n\n**Region code for `westeurope`:**\n\n`westeurope` → remove 'west' → `europe` → `eu`",
      "filesRead": [
        "reference/configuration.md",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/subsystems/network-common.md",
        "reference/subsystems/runtime.md"
      ],
      "plannedFiles": [
        "reference/configuration.md",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/subsystems/network-common.md",
        "reference/subsystems/runtime.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI answer matches ground truth exactly on algorithm steps and example derivation. All three algorithm steps present, correct example provided. No hallucinated information. Navigation score reduced slightly due to lack of visible evidence of information retrieval process, though answer correctness suggests successful information location."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "config-app-config-merge-order",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the configuration merge order for app charts in Foxtrot, from lowest to highest precedence?",
      "groundTruth": "1. values.yaml, 2. default-properties.yaml, 3. default-values.yaml / k8s-values.yaml, 4. common-values.yaml, 5. properties-override.yaml, 6. k8s-override.yaml, 7. imagetags-override.yaml.",
      "llmAnswer": "From lowest to highest precedence:\n\n1. `values.yaml` (chart defaults)\n2. `default-properties.yaml`\n3. `default-values.yaml` / `k8s-values.yaml`\n4. `common-values.yaml`\n5. `properties-override.yaml`\n6. `k8s-override.yaml`\n7. `imagetags-override.yaml` (highest precedence)",
      "filesRead": [
        "reference/configuration.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/configuration.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly in both content and order. All 7 configuration files listed in correct precedence sequence. Helpful annotations (chart defaults, highest precedence) add clarity without introducing hallucinations or inaccuracies."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "config-account-creation-product-id",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What is the AWS Service Catalog product ID used by account-common for account creation, and what are the customer-facing and internal OU IDs?",
      "groundTruth": "Service Catalog product ID: `prod-mts6togilnnuk`. Customer-facing OU: `ou-8vpz-mogk4qua`. Internal OU: `ou-8vpz-rw7etshv`. Provider config reference: `aws-709659200419`.",
      "llmAnswer": "Based on the configuration reference documentation:\n\n**AWS Service Catalog Product ID for account creation:** `prod-mts6togilnnuk`\n\n**Customer-facing OU ID:** `ou-8vpz-mogk4qua`\n\n**Internal OU ID:** `ou-8vpz-rw7etshv`",
      "filesRead": [
        "reference/subsystems/account-common.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/subsystems/account-common.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 3,
        "precision": 5,
        "navigation": 4,
        "notes": "AI correctly identified all three primary identifiers (product ID, customer-facing OU, internal OU) with exact matches to ground truth. However, omitted the provider config reference (aws-709659200419) that was part of the complete ground truth. Answer is accurate and free of hallucination, but incomplete as it covers 3 of 4 expected items."
      },
      "composite": 85,
      "notFound": false
    },
    {
      "id": "config-ipam-rds-backup",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What are the RDS automated backup retention period and backup window configured in ipam-core?",
      "groundTruth": "RDS automated backup retention: 7 days. RDS backup window: 03:00-06:00 UTC.",
      "llmAnswer": "Based on the configuration reference provided:\n\n**Backup retention period:** 7 days\n\n**Backup window:** \"03:00-06:00\" UTC\n\nThese settings are configured in `ipam-core/netbox/rds.tf` for the PostgreSQL RDS instance that backs NetBox.",
      "filesRead": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly on both metrics (7 days retention, 03:00-06:00 UTC window). Correctly identified the source file (ipam-core/netbox/rds.tf) and provided accurate contextual information without hallucination."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "config-dev-artifact-naming",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the artifact naming convention for development chart versions vs release chart versions in the OCI registry?",
      "groundTruth": "Development artifacts: `{chart-name}:0.0.0-{commit-sha}`. Release artifacts: `{chart-name}:{chart-version}` (semantic version from Chart.yaml).",
      "llmAnswer": "**Development artifacts:** `{chart-name}:0.0.0-{commit-sha}`\n\n**Release artifacts:** `{chart-name}:{chart-version}` (semantic version from Chart.yaml)",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "reference/system-architecture.md",
        "reference/subsystems/root.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "reference/system-architecture.md",
        "reference/subsystems/root.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI answer matches ground truth exactly. Both development and release artifact naming conventions are correct and complete. No hallucinated information. Navigation score reduced from 5 only because no explicit evidence of search process is visible, though correct answer suggests competent information retrieval."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "services-tech-stack-orchestration",
      "category": "services",
      "difficulty": "easy",
      "question": "What technologies make up the orchestration layer in Foxtrot's technology stack?",
      "groundTruth": "Kubernetes (EKS, GKE, AKS), ArgoCD, and Crossplane.",
      "llmAnswer": "Based on the provided documentation, the orchestration layer in Foxtrot's technology stack comprises:\n\n1. **ArgoCD** - GitOps-based deployment orchestration (ApplicationSets for hub-and-spoke topology management)\n2. **Argo Workflows** - Workflow orchestration engine\n3. **Argo Events** - Event-driven automation and triggering\n4. **Crossplane** - Declarative infrastructure orchestration and provisioning (with multiple provider packages for AWS, Azure, GCP, and Terraform)\n\nThese technologies are centralized in the `control-core` subsystem, which serves as the control plane. ArgoCD manages the deployment topology across hub and spoke clusters, while Argo Workflows and Events handle operational automation. Crossplane orchestrates cloud resource provisioning declaratively across multiple cloud providers.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 2,
        "navigation": 2,
        "notes": "Answer correctly identifies ArgoCD and Crossplane but critically omits Kubernetes (EKS, GKE, AKS) - the foundational orchestration platform. Additionally halluccinates Argo Workflows and Argo Events as core orchestration components when they are not listed in ground truth. Covers only 2 of 3 required technologies while introducing 2 incorrect additions, indicating incomplete source navigation and potential confusion between related but distinct Argo projects."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "services-state-management",
      "category": "services",
      "difficulty": "easy",
      "question": "What are the four state management services used in Foxtrot?",
      "groundTruth": "Elasticsearch, Hazelcast, Redis, and Milvus.",
      "llmAnswer": "NOT_FOUND\n\nThe documentation lists **five** state management services, not four:\n- elasticsearch\n- hazelcast\n- cassandra\n- milvus\n- redis\n\nThe question asks for four, but the system architecture documentation specifies these five services under \"Configuration & Operational Defaults.\"",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/subsystems/runtime.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/subsystems/runtime.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND, providing no information. While the response contains no hallucinations or false claims (precision intact), it failed to locate the state management services documentation. The agent demonstrated no ability to navigate to the relevant configuration documentation where these services are defined."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "services-eks-addon-versions",
      "category": "services",
      "difficulty": "hard",
      "question": "What are the default pinned versions for the vpc-cni and coredns EKS addons in compute-common?",
      "groundTruth": "vpc-cni: v1.20.1-eksbuild.1. coredns: v1.12.2-eksbuild.4.",
      "llmAnswer": "Based on the system-architecture.md file, under \"Configuration & Operational Defaults\" → \"Cluster Addons (EKS/GKE)\":\n\n- **vpc-cni**: v1.20.1-eksbuild.1\n- **coredns**: v1.12.2-eksbuild.4",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both addon versions. Correctly cited source document and section. No hallucinations or extraneous information. Demonstrated direct navigation to the correct file and configuration section."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "services-aws-nat-egress-model",
      "category": "services",
      "difficulty": "medium",
      "question": "What is the AWS NAT egress model for internal VPCs vs production clusters in Foxtrot?",
      "groundTruth": "Internal AWS VPCs use shared NAT egress. Production AWS clusters use dedicated NAT egress per cluster. This split reduces cost for internal environments while isolating production egress and capacity per cluster.",
      "llmAnswer": "**Internal AWS VPCs:** Shared NAT egress (`nat_gateway_configuration = \"all_azs\"` for shared VPCs)\n\n**Production AWS clusters:** Dedicated NAT egress per cluster\n\n**Internal environments:** `nat_gateway_configuration = \"none\"` (no dedicated NAT)\n\nThis split reduces cost for internal environments while isolating production egress and capacity.",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/system-architecture.md",
        "reference/subsystems/network-core.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/system-architecture.md",
        "reference/subsystems/network-core.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 4,
        "precision": 2,
        "navigation": 2,
        "notes": "Core facts align: shared NAT for internal VPCs, dedicated for production clusters, cost/isolation benefits stated correctly. However, the answer introduces unverified configuration syntax (`nat_gateway_configuration` parameters) not present in ground truth, creating hallucination risk. Internal environments section contradicts itself (claims both 'shared NAT egress' and 'nat_gateway_configuration = none'). Cannot verify navigation without seeing the source discovery process."
      },
      "composite": 55,
      "notFound": false
    },
    {
      "id": "services-ipam-netbox-role",
      "category": "services",
      "difficulty": "medium",
      "question": "What is NetBox used for in Foxtrot, and what infrastructure does it run on?",
      "groundTruth": "NetBox is Foxtrot's IP address management (IPAM) source of truth. It tracks all VPC CIDR blocks across clouds, prevents CIDR overlap, and serves allocation data to compute-tools during VPC and cluster provisioning. It runs on AWS ECS with an RDS database and ALB, provisioned via Terraform in ipam-core.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation references NetBox only indirectly through file paths (`ipam-core/netbox/rds.tf`) and a sync script (`ipam-tools/netbox_sync/netbox_sync.py`), but does not explicitly describe what NetBox is used for in Foxtrot or provide details about the infrastructure it runs on beyond a reference to RDS backup configuration parameters.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "reference/subsystems/network-core.md",
        "reference/subsystems/runtime.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "reference/subsystems/network-core.md",
        "reference/subsystems/runtime.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that documentation lacks explicit information about NetBox's purpose and infrastructure. While ground truth exists, the agent's 'NOT_FOUND' response reflects the actual documentation gaps—no hallucination occurred, but the agent failed to locate or synthesize available information (file paths, sync scripts) that could have provided partial context."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-argo-gen-params-required",
      "category": "contracts",
      "difficulty": "easy",
      "question": "What are the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator?",
      "groundTruth": "chart.version (string), namespace (string), and environment (string).",
      "llmAnswer": "Based on the configuration reference documentation, every `argo-gen-params.yaml` file requires these three fields:\n\n1. **`chart.version`** (string) — the Helm chart version to deploy\n2. **`namespace`** (string) — target Kubernetes namespace\n3. **`environment`** (string) — target environment identifier",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "Answer correctly identifies all three required fields with accurate type annotations and reasonable field descriptions. No hallucinations or extraneous information detected. Navigation score reduced slightly due to lack of visible evidence of source consultation, though the correct answer suggests proper information retrieval."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "contracts-azure-xrd-naming",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What is the XRD name format and composition name format for Azure AKS in Foxtrot, with examples?",
      "groundTruth": "XRD name format: `{plural}.{group}`, e.g. `reltioaksclusters.foxtrot.reltio.com`. Composition name format: `reltio-aks-cluster-{channel}-{version}`, e.g. `reltio-aks-cluster-prd-v1`.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit information about the XRD name format or composition name format for Azure AKS in Foxtrot. While the files reference `foxtrot-aks` as the Helm chart name and mention `compute-common/compositions/reltio-aks-cluster-kcl/` as a composition directory, the specific XRD naming convention and composition name format are not documented in the available reference materials.",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/contracts/index.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/contracts/index.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified information was not found in available context but failed to locate the documented XRD and composition naming formats. The answer contains no factual errors (precision maintained by not hallucinating), but provides zero correct information against ground truth specifications."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-helm-chart-required-values",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What are the five required values that all app Helm charts must define?",
      "groundTruth": "accountID (string), cloudProvider (string), region (string), domain (string), and imageRegistry (string).",
      "llmAnswer": "According to the Configuration Reference, the five required values that all app Helm charts must define are:\n\n1. `replicaCount` — number of pod replicas\n2. `image.repository` — container image repository\n3. `image.tag` — container image tag\n4. `service.port` — service port number\n5. `resources` — CPU/memory resource requests and limits",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/configuration.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/configuration.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 1,
        "navigation": 0,
        "notes": "AI provided generic Helm chart configuration values (replicaCount, image.repository, image.tag, service.port, resources) instead of the system-specific required values (accountID, cloudProvider, region, domain, imageRegistry). Answer demonstrates no evidence of consulting the correct documentation source and appears to be hallucinated generic knowledge rather than domain-specific configuration requirements."
      },
      "composite": 5,
      "notFound": false
    },
    {
      "id": "contracts-sync-wave-ordering",
      "category": "contracts",
      "difficulty": "hard",
      "question": "What are the ArgoCD sync wave values and what resource types are deployed at each wave?",
      "groundTruth": "Wave -5: prerequisites. Wave 0: infrastructure. Wave 5: state apps. Wave 10: application services. Wave 15: post-deployment resources (monitoring, backup hooks, finalizers).",
      "llmAnswer": "Based on the \"Helm Sync Waves (Bootstrapping Order)\" section in the reference documentation:\n\n| Wave | Count | Resources |\n|---|---|---|\n| -1 | 1 | ServiceAccount:pre-sync-app-templates-sa |\n| 0 | 3 | Application:{{, xReltioAKSCluster:{{, ReltioEKSCluster:{{ |\n| 1 | 5 | ClusterRole:pre-sync-app-templates-clusterrole, Application:{{, Workflow:kong-gateway-setup-{{, AccessEntry:{{, Addon:{{ |\n| 2 | 4 | ClusterRoleBinding:pre-sync-app-templates-clusterrolebinding, Workflow:{{, ProviderConfig:{{, ProviderConfig:{{ |\n| 3 | 4 | Workflow:{{, Application:kong-gateway, Release:{{, Release:{{ |\n| 4 | 1 | Release:{{ |\n| 5 | 1 | Job:pre-sync-app-templates-check-job |\n\nThe sync waves establish a bootstrapping order where ServiceAccounts deploy first (wave -1), followed by Applications and cluster infrastructure (wave 0), then RBAC resources (waves 1-2), application workflows and releases (waves 3-4), and finally validation jobs (wave 5).",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 1,
        "navigation": 0,
        "notes": "AI answer provides completely different wave numbers (-1, 0-5) than ground truth (-5, 0, 5, 10, 15). Resource categorizations don't align: ground truth uses conceptual groupings (prerequisites, infrastructure, state apps, application services, post-deployment), while AI lists specific resource types at different waves. Missing waves 10 and 15 entirely. Answer appears to reference a different system or hallucinated documentation. No factual overlap with ground truth."
      },
      "composite": 5,
      "notFound": false
    }
  ]
}