dev-intel-v2/eval-ref-pages-v2.json

{
  "evalType": "agent",
  "timestamp": "2026-03-10T18:15:46.607Z",
  "docsDir": "./foxtrot-docs",
  "totalQuestions": 32,
  "overallScore": "52.5",
  "avgAccuracy": "2.31",
  "avgCompleteness": "2.25",
  "avgPrecision": "3.66",
  "avgNavigation": "2.28",
  "notFoundCount": 10,
  "notFoundRate": "31.3%",
  "byCategory": {
    "architecture": {
      "avg": "72.5",
      "count": 6
    },
    "dependencies": {
      "avg": "28.0",
      "count": 5
    },
    "operations": {
      "avg": "51.7",
      "count": 6
    },
    "configuration": {
      "avg": "75.0",
      "count": 6
    },
    "services": {
      "avg": "53.0",
      "count": 5
    },
    "contracts": {
      "avg": "20.0",
      "count": 4
    }
  },
  "byDifficulty": {
    "easy": {
      "avg": "43.0",
      "count": 10
    },
    "medium": {
      "avg": "53.5",
      "count": 17
    },
    "hard": {
      "avg": "68.0",
      "count": 5
    }
  },
  "results": [
    {
      "id": "arch-layered-order",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What are the five layers in Foxtrot's layered architecture, listed from top to bottom?",
      "groundTruth": "Application Layer (app-*), Compute Layer (compute-*), Network Layer (network-*), Account Layer (account-*), Control Plane (control-core).",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation describes a 9-layer architecture, not a 5-layer architecture. The layers listed from top to bottom are: Runtime (shared), Application, Compute, Network, IPAM, Account, Control Plane, Skills/Tooling, and Documentation.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/index.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/index.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 1,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND then provided incorrect information about a 9-layer architecture that contradicts the ground truth 5-layer specification. The layers described (IPAM, Skills/Tooling, Documentation) don't appear in the ground truth, and the specific naming conventions (app-*, compute-*, etc.) are missing. The agent failed to locate or retrieve the correct documentation."
      },
      "composite": 5,
      "notFound": true
    },
    {
      "id": "arch-hub-spoke-ownership",
      "category": "architecture",
      "difficulty": "easy",
      "question": "Which ArgoCD instance owns the account, network, and compute layers, and which owns the application layer?",
      "groundTruth": "Hub ArgoCD owns account, network, and compute layers (infrastructure and control-plane appsets). Spoke ArgoCD owns the application layer (app-runtime appsets only).",
      "llmAnswer": "Based on the system architecture documentation provided:\n\n**Hub (Infrastructure/Control Plane) owns:**\n- Account layer: `account-creation` ApplicationSet\n- Network layer: `aws-vpcs`, `gcp-vpcs`, `azure-vpcs` ApplicationSets\n- Compute layer: `aws-argocd-feature-clusters`, `gcp-argocd-feature-clusters`, `azure-argocd-feature-clusters` ApplicationSets\n\n**Spoke (Applications/Runtime) owns:**\n- Application layer: managed by spoke cluster ApplicationSets (the specific ApplicationSet names are templated as `{{` in the documentation, but the `argocd-apps` chart is explicitly described as \"Helm template for ArgoCD templates for spoke cluster\")\n\nIn summary: The **Hub** ArgoCD instance owns the account, network, and compute infrastructure layers, while the **Spoke** ArgoCD instances own the application layer.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 3,
        "navigation": 4,
        "notes": "Core facts are correct: Hub owns infrastructure layers (account/network/compute), Spoke owns application layer. Answer demonstrates solid architectural understanding and references specific ApplicationSets and charts. However, precision is reduced because specific ApplicationSet names (account-creation, aws-vpcs, etc.) cannot be verified against ground truth and may be partially hallucinated, though the agent acknowledges some templating limitations. Navigation appears competent given correct core answer, though specific file paths aren't shown."
      },
      "composite": 85,
      "notFound": false
    },
    {
      "id": "arch-aws-regions",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What AWS regions does Foxtrot support?",
      "groundTruth": "us-east-1, us-west-2, eu-west-1, eu-central-1.",
      "llmAnswer": "Based on the system architecture documentation, Foxtrot supports the following AWS regions:\n\n- ap-southeast-1\n- eu-central-1\n- eu-west-1\n- eu-west-2\n- eu-west-3\n- us-east-1\n- us-east-2\n- us-west-1\n- us-west-2",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/network-common-charts-foxtrot-aws-vpc.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 5,
        "precision": 1,
        "navigation": 2,
        "notes": "AI answer includes all 4 ground truth regions but adds 5 unsupported regions (ap-southeast-1, eu-west-2, eu-west-3, us-east-2, us-west-1), indicating hallucination or unreliable source verification. While completeness is achieved, the answer contains significant false positives that undermine accuracy and precision."
      },
      "composite": 50,
      "notFound": false
    },
    {
      "id": "arch-gcp-shared-vpc-host",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the default GCP host project used for Shared VPC in network-common, and what is the host network path it attaches to?",
      "groundTruth": "The default host project is `network-services-436015`. When `sharedVpc.enabled: true`, service project resources attach to the host network path `projects/HOST_PROJECT/global/networks/gcp-core-network`.",
      "llmAnswer": "Based on the Network Architecture Reference documentation:\n\n**Default GCP host project:** `network-services-436015`\n\n**Host network path:** `projects/network-services-436015/global/networks/gcp-core-network`\n\nWhen `sharedVpc.enabled: true`, service project resources attach to this host network path.",
      "filesRead": [
        "reference/subsystems/network-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/network-architecture.md",
        "reference/helm/index.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/subsystems/network-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/network-architecture.md",
        "reference/helm/index.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 4,
        "notes": "AI answer correctly identifies the host project ID and provides the fully-resolved network path with the actual project ID substituted. All ground truth elements are covered accurately. The substitution of the concrete project ID into the path template is appropriate and practical. Navigation score is 4 rather than 5 because while the answer demonstrates knowledge of the correct information, there's no visible evidence of the agent actively navigating to the source documentation to retrieve it—the information could derive from training data rather than active retrieval."
      },
      "composite": 95,
      "notFound": false
    },
    {
      "id": "arch-cidr-employee-access",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for the employee access (bastions) segment on AWS and on GCP?",
      "groundTruth": "AWS: 10.208.128.0/24. GCP: 10.128.128.0/24.",
      "llmAnswer": "**AWS:** `10.208.128.0/24`\n\n**GCP:** `10.128.128.0/24`",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both AWS (10.208.128.0/24) and GCP (10.128.128.0/24) CIDR ranges. Response is complete, contains no hallucinations or extraneous information, and demonstrates successful retrieval of the correct network configuration data."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "arch-production-cidr",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for production workloads on AWS and on GCP?",
      "groundTruth": "AWS: 10.192.0.0/10. GCP: 10.128.0.0/10.",
      "llmAnswer": "**AWS:** `10.192.0.0/10`\n\n**GCP:** `10.128.0.0/10`",
      "filesRead": [
        "reference/network-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/network-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly for both AWS and GCP CIDR ranges. No hallucinations, no missing information, no extraneous details. Correct and complete response."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "dep-runtime-common-horizontal",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "Which runtime repositories consume charts from which common repositories?",
      "groundTruth": "app-runtime consumes charts from app-common; compute-runtime consumes charts from compute-common; network-runtime consumes charts from network-common; account-runtime consumes charts from account-common.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not explicitly specify which runtime repositories consume charts from which common repositories. While the files show:\n\n- Runtime repository paths (e.g., `runtime/app-runtime/`, `runtime/account-runtime/`, `runtime/compute-runtime/`)\n- Common repository charts (e.g., `app-common/charts/`, `account-common/charts/`, `compute-common/templates/`)\n- References to argo-gen-params.yaml files in runtime paths that suggest chart consumption\n\nThe documentation does not provide a clear, explicit mapping of runtime-to-common repository chart consumption relationships. The expected format is also undefined, making it impossible to provide an answer in the required format.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/runtime.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/runtime.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent failed to locate the runtime-to-common repository chart consumption mappings. The ground truth specifies four explicit relationships (app, compute, network, account), but the agent returned NOT_FOUND without providing any of these mappings. While the agent avoided hallucinating false information (hence precision=5), it demonstrated no ability to navigate to or extract the required information from available documentation."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-vertical-layers",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "What are the vertical layer dependencies in Foxtrot's architecture?",
      "groundTruth": "Application Layer depends on Compute Layer; Compute Layer depends on Network Layer; Network Layer depends on Account Layer; Account Layer is managed by the Control Plane.",
      "llmAnswer": "Based on the system architecture documentation, the vertical layer dependencies in Foxtrot's architecture are:\n\n**Layered Architecture (top to bottom):**\n1. Runtime (shared)\n2. Application (app-common, app-tools)\n3. Compute (compute-common, compute-tools)\n4. Network (network-common, network-core)\n5. IPAM (ipam-core, ipam-tools)\n6. Account (account-common)\n7. Control Plane (control-core)\n8. Skills/Tooling (skills)\n9. Documentation (docs)\n\n**Key vertical dependencies:**\n- The `*-tools` subsystems (app-tools, compute-tools, ipam-tools) form a shared operational tier above `root` but below domain consumers\n- ipam-tools depends on both app-tools and compute-tools, reflecting that IP address management must reason about both application topology and compute resource allocation simultaneously\n- All domain subsystems (app-tools, ipam-tools, compute-tools, compute-common, app-common, ipam-core, runtime) depend on `root`, which serves as a cross-cutting coordination point containing shared primitives\n- The architecture favors explicit layering over encapsulation, maintaining a flat dependency graph where infrastructure state and operational tooling remain visibly distinct",
      "filesRead": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "reference/subsystems/root.md",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "reference/subsystems/root.md",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 2,
        "precision": 1,
        "navigation": 2,
        "notes": "AI answer identifies correct layers but describes fundamentally different dependency relationships. Ground truth specifies linear chain (App→Compute→Network→Account→Control Plane), while AI describes complex graph with root as cross-cutting point and ipam-tools bidirectional dependencies. AI adds unspecified layers (IPAM, Runtime, Skills) and misses the explicit dependency chain. Core vertical dependencies from ground truth are not clearly stated."
      },
      "composite": 30,
      "notFound": false
    },
    {
      "id": "dep-create-account-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "How many PRs does `create-account` produce and which repositories does it touch?",
      "groundTruth": "4 PRs: account-runtime, network-core (account principal updates), control-core (provider config + assume-role policy updates), and cloud-iam (account bootstrap TF).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about how many PRs `create-account` produces or which repositories it touches.",
      "filesRead": [
        "agent-kb.json",
        "reference/contracts/index.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "agent-kb.json",
        "reference/contracts/index.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent failed to locate documentation containing `create-account` PR information. The ground truth specifies 4 PRs across 4 repositories with specific domain context (account principal updates, provider config, assume-role policy, account bootstrap). Agent's NOT_FOUND response indicates inability to navigate to relevant documentation source, though the response itself contains no false information."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-create-cluster-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "Which repositories does `create-cluster` commit to, and how many PRs does it typically produce?",
      "groundTruth": "4 PRs: compute-runtime (cluster path) plus 3 PRs in cloud-auth-core (cluster/node role PR, IRSA role PR, and network role PR). It also calls cluster-netbox-allocate.py to reserve/allocate cluster subnet CIDRs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation references `create-cluster.sh` in `compute-tools/bash-spoke-setup/scripts/` and shows it contains naming conventions and timeout parameters, but contains no information about which repositories it commits to or how many PRs it typically produces.",
      "filesRead": [
        "reference/contracts/index.md",
        "agent-kb.json",
        "reference/subsystems/control-core.md",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "agent-kb.json",
        "reference/subsystems/control-core.md",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 1,
        "notes": "AI correctly identified it could not find the required information and avoided hallucination, but failed to locate the actual documentation containing repository commit details and PR generation specifics. Found a related file (create-cluster.sh) but not the authoritative source documenting the 4-PR pattern across compute-runtime and cloud-auth-core repositories."
      },
      "composite": 30,
      "notFound": true
    },
    {
      "id": "dep-compute-common-deps",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "What are the core dependencies of compute-common for cluster provisioning?",
      "groundTruth": "Crossplane core + provider-aws/provider-kubernetes, Helm, KCL build pipeline, and ArgoCD deployment flow.",
      "llmAnswer": "Based on the documentation provided, the core dependencies of `compute-common` for cluster provisioning are:\n\n**Hard Dependencies:**\n- **root** (29 imports) — foundational configuration and environment resolution primitives that enable cluster lifecycle orchestration and environment-aware test parameterization\n\n**Operational Helpers:**\n- `helm_helper` — Helm chart deployment and management\n- `kubectl_helper` — Kubernetes API interaction\n- `cleanup_tracker` — resource cleanup and teardown orchestration\n\n**Configuration Primitives:**\n- `test_env_config` — environment configuration resolution\n- `test_values_file` — Helm values file management\n\n**Infrastructure Artifacts:**\n- 64 files comprising Helm chart templates and Kubernetes manifests for multi-cloud cluster provisioning (AKS, EKS, GKE)\n\n**Helm Value Contracts:**\n- `foxtrot_aks_values` (accountID, composition, cluster)\n- `foxtrot_eks_values` (accountID, composition, efs, cluster)\n- `foxtrot_gke_values` (cluster)\n\nThe module is intentionally configuration-heavy, with all 14 public functions being test utilities and lifecycle fixtures rather than production logic. State is externalized into cluster and Helm configuration rather than held in-process, which is the correct model for infrastructure orchestration where authoritative state lives in the control plane.",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "diagrams/compute-common-contracts.mmd",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "diagrams/compute-common-contracts.mmd",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 1,
        "precision": 2,
        "navigation": 1,
        "notes": "AI answer identifies Helm and Kubernetes helpers but completely misses the core infrastructure dependencies stated in ground truth: Crossplane core, provider-aws/provider-kubernetes, KCL build pipeline, and ArgoCD. The response conflates test utilities and configuration management with actual provisioning dependencies, suggesting the agent examined test code rather than the provisioning layer. Hallucinated extensive detail about test fixtures and configuration contracts not present in ground truth."
      },
      "composite": 30,
      "notFound": false
    },
    {
      "id": "ops-argocd-deployment-flow",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the ArgoCD deployment flow for an application, from ApplicationSet watching app-runtime to pods starting?",
      "groundTruth": "1. ApplicationSet watches app-runtime. 2. Reads argo-gen-params.yaml per environment. 3. Generates Application. 4. Pulls chart from OCI registry. 5. Merges values and overrides. 6. Renders manifests. 7. Applies to cluster. 8. ESO fetches secrets. 9. Pods start with injected configs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the ArgoCD infrastructure (ApplicationSets, resources generated, sync waves, and deployment topology) but does not contain a detailed step-by-step explanation of the deployment flow from ApplicationSet watching app-runtime to pods starting. The documentation shows that ApplicationSets exist in the argocd-apps chart and that sync waves are defined (waves -1 through 5), but the specific operational steps of how ApplicationSets monitor app-runtime and trigger pod creation are not documented in these files.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent correctly identified that the provided documentation did not contain the specific step-by-step deployment flow from ApplicationSet watching to pod startup. While the agent demonstrated understanding of what was present (ApplicationSets, sync waves, topology), it could not locate or provide the 9-step operational flow specified in ground truth. The 'NOT_FOUND' response is precise (no hallucination) but fails to deliver the required information."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-ebf-release-pattern",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the EBF/weekly release pattern?",
      "groundTruth": "1. Create release branch from develop. 2. Promote versions for release/hotfix environments. 3. Run stage pipelines (preview, preprod, production). 4. Merge promoted changes to main. 5. Reconcile hotfix and develop branches.",
      "llmAnswer": "Based on the documentation provided:\n\n**EBF (Emergency Bug Fix) Release Pattern:**\n1. Changes are cherry-picked from `develop`\n2. Changes are verified on EBF verification clusters (via `hotfix` branch in app-runtime)\n3. Changes are promoted to production via `main` branch\n\n**Weekly Release Pattern:**\nThe documentation references a `release` branch in app-runtime that maps to \"release verification (weekly release staging)\" clusters, but the specific steps for the weekly release workflow are not detailed in the provided files.",
      "filesRead": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 1,
        "precision": 3,
        "navigation": 2,
        "notes": "AI found partial information about EBF (cherry-pick, hotfix branch, main promotion) but missed critical steps: version promotion, stage pipelines (preview/preprod/production), and branch reconciliation. Explicitly acknowledged weekly release steps weren't found, showing honesty but indicating incomplete documentation search. Did not locate or reference the authoritative release pattern specification containing all 5 steps."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "ops-rollback-procedure",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the preferred rollback procedure in Foxtrot, and when are script-level deletes used instead?",
      "groundTruth": "GitOps rollback is preferred: revert the runtime/config commit, merge the revert PR, and confirm ArgoCD reconciliation. Script-level deletes (delete-vpc.sh, delete-cluster.sh, delete-app.sh) are used only with explicit approval and change control.",
      "llmAnswer": "Based on the provided documentation, I can answer the first part of your question:\n\n**Preferred Rollback Procedure:**\nRollback is performed by reverting the `chart.version` in `argo-gen-params.yaml` to the previous known-good version. ArgoCD detects the change and re-syncs the application with the prior chart version from the OCI registry.\n\n**Regarding script-level deletes:**\nNOT_FOUND - The documentation does not contain information about when script-level deletes are used as an alternative to the standard rollback procedure.",
      "filesRead": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/operations.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 1,
        "precision": 3,
        "navigation": 1,
        "notes": "AI provided a GitOps-aligned rollback approach but diverged from ground truth's specific procedure (revert commit → merge PR → confirm reconciliation). Completely failed to locate script-level delete information, returning NOT_FOUND instead of the documented conditions for their use. Poor documentation navigation and incomplete coverage of both question components."
      },
      "composite": 35,
      "notFound": true
    },
    {
      "id": "ops-branch-cluster-mapping",
      "category": "operations",
      "difficulty": "easy",
      "question": "What is the branch-to-cluster mapping in app-runtime? Which branch targets which environments?",
      "groundTruth": "develop → internal clusters (development and QA); main → customer-facing clusters (production); hotfix → EBF verification; release → release verification (weekly release staging).",
      "llmAnswer": "## Branch to Cluster Mapping (app-runtime)\n\n- `develop` → internal clusters (development and QA)\n- `main` → customer-facing clusters (production)\n- `hotfix` → EBF verification\n- `release` → release verification (weekly release staging)",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/operations.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/operations.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly across all four branch mappings with no omissions, hallucinations, or inaccuracies. Perfect alignment on all dimensions."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "ops-jenkins-jobs",
      "category": "operations",
      "difficulty": "hard",
      "question": "What are the six Jenkins jobs in the Foxtrot pipeline and what does each do?",
      "groundTruth": "appRuntimeSync: syncs runtime branches on changes to main/hotfix/release. foxtrotSync: updates imagetags-override.yaml with new image tags. foxtrotPromote: promotes and merges release outputs. chartSync: updates argo-gen-params.yaml with chart versions. chartCheckInAppCommon: detects chart version mismatches and triggers chartSync. chartPromoteVersion: promotes chart versions per release stage.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about six Jenkins jobs in a Foxtrot pipeline. While the files reference Jenkins as a Helm chart and mention Foxtrot-related infrastructure templates (foxtrot-eks, foxtrot-aks, foxtrot-gke, etc.), there is no documentation describing specific Jenkins pipeline jobs or their purposes.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent correctly identified that the information was not found in available documentation, avoiding hallucination. However, failed to locate the source material containing the six Jenkins job definitions and their purposes. The honest 'NOT_FOUND' response demonstrates precision but indicates the agent did not navigate to the correct documentation."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-create-cluster-timeout",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the timeout for waiting for a cluster to reach ready condition in the create-cluster workflow?",
      "groundTruth": "3600 seconds (1 hour), with a poll interval of 30 seconds.",
      "llmAnswer": "Based on the information in the provided documentation, specifically from the \"Configuration & Operational Defaults\" section in reference/system-architecture.md:\n\n**MAX_WAIT = 3600** (seconds)\n\nThis is listed as a timeout parameter for `compute-tools/bash-spoke-setup/scripts/create-cluster.sh`, which represents the maximum wait time for cluster operations, including reaching ready condition.",
      "filesRead": [
        "reference/contracts/index.md",
        "reference/subsystems/control-core.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/system-architecture.md",
        "diagrams/all-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "reference/subsystems/control-core.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/system-architecture.md",
        "diagrams/all-contracts.mmd"
      ],
      "score": {
        "accuracy": 4,
        "completeness": 3,
        "precision": 5,
        "navigation": 5,
        "notes": "AI correctly identified the 3600-second timeout from the right documentation source and file section. However, it missed the poll interval of 30 seconds that is part of the complete ground truth answer. Navigation was excellent—went directly to the correct file and section. No hallucinations or incorrect information present."
      },
      "composite": 85,
      "notFound": false
    },
    {
      "id": "config-cloud-resource-naming",
      "category": "configuration",
      "difficulty": "easy",
      "question": "What is the canonical format for cloud resource names in Foxtrot, and what does each component represent?",
      "groundTruth": "Format: `{provider}-{phase}-{region-code}-{index}-{resource}[-{descriptor}]`. provider: aws/gcp/azr; phase: prd/iqa/dev; region-code: two-letter derived code; index: incremental integer; resource: vpc/eks/gke/aks/lb/rds. Example: `gcp-prd-us-1-vpc-blue`.",
      "llmAnswer": "Based on the system-architecture.md documentation, Foxtrot uses the following canonical naming conventions for cloud resources:\n\n**AWS Resources:**\n- **EKS Clusters:** `aws-{phase}-{region-code}-{index}-eks`\n- **VPCs:** `aws-{phase}-{region-code}-{index}-vpc`\n\n**Azure Resources:**\n- **AKS Clusters:** `azr-{phase}-{region-code}-{index}-aks`\n- **VNets:** `azr-{phase}-{region-code}-{index}-vnet`\n\n**Component Breakdown:**\n- `aws` / `azr` — Cloud provider prefix (AWS or Azure)\n- `{phase}` — Deployment phase (e.g., dev, prd)\n- `{region-code}` — Region identifier (e.g., us-east-1, eu-west-1)\n- `{index}` — Numeric index for resource disambiguation\n- Resource type suffix — Service type (eks, aks, vpc, vnet)",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/network-architecture.md",
        "reference/helm/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/network-architecture.md",
        "reference/helm/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 2,
        "navigation": 2,
        "notes": "AI captures the basic format structure correctly but contains significant gaps and errors. Missing: GCP provider entirely, iqa phase, resource types (gke, lb, rds), optional descriptor component, and canonical example. Critical error: region code examples show full region names (us-east-1) instead of two-letter codes as specified in ground truth. Includes 'vnet' not in ground truth list. Answer demonstrates partial knowledge but fails to cover the complete specification."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "config-region-code-algorithm",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the algorithm for deriving a two-letter region code from a cloud region name, and what is the region code for `westeurope`?",
      "groundTruth": "Algorithm: 1. Remove directional words (east, west, north, south). 2. Remove separators. 3. Take first two characters. For `westeurope`: remove 'west' → 'europe' → 'eu'.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation references a \"region code algorithm\" in `reference/configuration.md`, but that file's content was not included in the materials provided. The system architecture documentation shows naming conventions that use region codes (e.g., `aws-{phase}-{region-code}-{index}-eks`), but the algorithm for deriving the two-letter codes and the specific code for `westeurope` are not present in the available files.",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/subsystems/network-common.md",
        "reference/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/system-architecture.md",
        "reference/subsystems/network-common.md",
        "reference/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent correctly identified information was unavailable rather than hallucinating an answer, but failed to retrieve the algorithm and region code. The response 'NOT_FOUND' is appropriate given missing documentation, but represents complete failure to answer the question. No navigation to relevant files occurred."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-app-config-merge-order",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the configuration merge order for app charts in Foxtrot, from lowest to highest precedence?",
      "groundTruth": "1. values.yaml, 2. default-properties.yaml, 3. default-values.yaml / k8s-values.yaml, 4. common-values.yaml, 5. properties-override.yaml, 6. k8s-override.yaml, 7. imagetags-override.yaml.",
      "llmAnswer": "From lowest to highest precedence:\n\n1. `values.yaml` (chart defaults)\n2. `default-properties.yaml`\n3. `default-values.yaml` / `k8s-values.yaml`\n4. `common-values.yaml`\n5. `properties-override.yaml`\n6. `k8s-override.yaml`\n7. `imagetags-override.yaml` (highest precedence)",
      "filesRead": [
        "reference/configuration.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/configuration.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly in both content and order. All 7 configuration files listed in correct precedence sequence. Helpful annotations (chart defaults, highest precedence) add clarity without introducing hallucinations or inaccuracies."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "config-account-creation-product-id",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What is the AWS Service Catalog product ID used by account-common for account creation, and what are the customer-facing and internal OU IDs?",
      "groundTruth": "Service Catalog product ID: `prod-mts6togilnnuk`. Customer-facing OU: `ou-8vpz-mogk4qua`. Internal OU: `ou-8vpz-rw7etshv`. Provider config reference: `aws-709659200419`.",
      "llmAnswer": "Based on the configuration reference documentation:\n\n**AWS Service Catalog Product ID for account creation:** `prod-mts6togilnnuk`\n\n**Customer-facing OU ID:** `ou-8vpz-mogk4qua`\n\n**Internal OU ID:** `ou-8vpz-rw7etshv`",
      "filesRead": [
        "reference/subsystems/account-common.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/subsystems/account-common.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/contracts/index.md",
        "reference/configuration.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 3,
        "precision": 5,
        "navigation": 4,
        "notes": "AI correctly identified all three primary identifiers (product ID, customer-facing OU, internal OU) with exact matches to ground truth. However, omitted the provider config reference (aws-709659200419) that was part of the complete ground truth. Answer is accurate and free of hallucination, but incomplete as it covers 3 of 4 expected items."
      },
      "composite": 85,
      "notFound": false
    },
    {
      "id": "config-ipam-rds-backup",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What are the RDS automated backup retention period and backup window configured in ipam-core?",
      "groundTruth": "RDS automated backup retention: 7 days. RDS backup window: 03:00-06:00 UTC.",
      "llmAnswer": "\n\nBased on the system architecture documentation, the RDS automated backup configuration in ipam-core is:\n\n- **backup_retention_period:** 7 days\n- **backup_window:** 03:00-06:00 (UTC)\n\nThese values are configured in `ipam-core/netbox/rds.tf`.",
      "filesRead": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly on both metrics (7 days retention, 03:00-06:00 UTC window). Correctly identified the configuration file location. No hallucinations or extraneous information present."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "config-dev-artifact-naming",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the artifact naming convention for development chart versions vs release chart versions in the OCI registry?",
      "groundTruth": "Development artifacts: `{chart-name}:0.0.0-{commit-sha}`. Release artifacts: `{chart-name}:{chart-version}` (semantic version from Chart.yaml).",
      "llmAnswer": "Development artifacts: `{chart-name}:0.0.0-{commit-sha}`\n\nRelease artifacts: `{chart-name}:{chart-version}` (semantic version from Chart.yaml)",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer is verbatim match to ground truth. Both development and release artifact naming conventions are correctly stated with proper formatting and context. No extraneous information or hallucinations present."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "services-tech-stack-orchestration",
      "category": "services",
      "difficulty": "easy",
      "question": "What technologies make up the orchestration layer in Foxtrot's technology stack?",
      "groundTruth": "Kubernetes (EKS, GKE, AKS), ArgoCD, and Crossplane.",
      "llmAnswer": "Based on the documentation provided, the orchestration layer in Foxtrot's technology stack consists of:\n\n1. **ArgoCD** - GitOps-based deployment orchestration using ApplicationSets to manage hub-and-spoke deployment topology across AWS, GCP, and Azure\n2. **Argo Workflows** - Workflow orchestration for multi-step operational tasks\n3. **Argo Events** - Event-driven automation and triggering\n4. **Crossplane** - Declarative infrastructure provisioning and orchestration with multiple cloud provider packages (AWS, Azure, GCP)\n\nThese technologies are centralized in the `control-core` subsystem, which serves as the control plane orchestration layer managing deployment topology, infrastructure lifecycle, and operational workflows across the multi-cloud platform.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/control-core.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/control-core.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 2,
        "precision": 2,
        "navigation": 2,
        "notes": "AI correctly identified ArgoCD and Crossplane from ground truth but critically missed Kubernetes (EKS, GKE, AKS) entirely. Added Argo Workflows and Argo Events which are not in the ground truth, suggesting hallucination of related Argo products. Partial success in finding correct technologies but incomplete coverage and inclusion of incorrect information significantly undermines the answer."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "services-state-management",
      "category": "services",
      "difficulty": "easy",
      "question": "What are the four state management services used in Foxtrot?",
      "groundTruth": "Elasticsearch, Hazelcast, Redis, and Milvus.",
      "llmAnswer": "Based on the documentation provided, the \"Configuration & Operational Defaults\" section lists the following state management services:\n\n**elasticsearch, hazelcast, cassandra, milvus, redis**\n\nHowever, this lists **five** services, not four. The documentation does not specify exactly four state management services for Foxtrot. If forced to select four from the five listed, the most commonly referenced ones in the Helm charts are: **elasticsearch, hazelcast, cassandra, and redis**.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 2,
        "navigation": 2,
        "notes": "AI listed 5 services instead of 4, including Cassandra which is not in ground truth. Final selection (elasticsearch, hazelcast, cassandra, redis) excludes Milvus and includes incorrect Cassandra. Only 2 of 4 correct services identified. Agent found documentation but selected wrong services."
      },
      "composite": 30,
      "notFound": false
    },
    {
      "id": "services-eks-addon-versions",
      "category": "services",
      "difficulty": "hard",
      "question": "What are the default pinned versions for the vpc-cni and coredns EKS addons in compute-common?",
      "groundTruth": "vpc-cni: v1.20.1-eksbuild.1. coredns: v1.12.2-eksbuild.4.",
      "llmAnswer": "Based on the information in the provided files, specifically from the \"Configuration & Operational Defaults\" section in `reference/system-architecture.md`:\n\n- **vpc-cni**: v1.20.1-eksbuild.1\n- **coredns**: v1.12.2-eksbuild.4",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 5,
        "completeness": 5,
        "precision": 5,
        "navigation": 5,
        "notes": "AI answer matches ground truth exactly on both addon versions. Properly cited source file and section, demonstrating effective information retrieval. No hallucinations or extraneous details."
      },
      "composite": 100,
      "notFound": false
    },
    {
      "id": "services-aws-nat-egress-model",
      "category": "services",
      "difficulty": "medium",
      "question": "What is the AWS NAT egress model for internal VPCs vs production clusters in Foxtrot?",
      "groundTruth": "Internal AWS VPCs use shared NAT egress. Production AWS clusters use dedicated NAT egress per cluster. This split reduces cost for internal environments while isolating production egress and capacity per cluster.",
      "llmAnswer": "**Internal AWS VPCs:** Shared NAT egress (`nat_gateway_configuration = \"all_azs\"` for shared VPCs)\n\n**Production AWS clusters:** Dedicated NAT egress per cluster\n\n**Internal environments:** `nat_gateway_configuration = \"none\"` (no dedicated NAT)\n\nThis split reduces cost for internal environments while isolating production egress and capacity.",
      "filesRead": [
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/network-architecture.md",
        "reference/system-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/network-architecture.md",
        "reference/system-architecture.md",
        "reference/subsystems/network-core.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 4,
        "precision": 2,
        "navigation": 2,
        "notes": "Core facts align: shared NAT for internal VPCs, dedicated for production clusters, cost/isolation benefits stated correctly. However, the answer introduces unverified configuration syntax (`nat_gateway_configuration` parameters) not present in ground truth, creating hallucination risk. Internal environments section contradicts itself (claims both 'shared NAT egress' and 'nat_gateway_configuration = none'). Cannot verify navigation without seeing the source discovery process."
      },
      "composite": 55,
      "notFound": false
    },
    {
      "id": "services-ipam-netbox-role",
      "category": "services",
      "difficulty": "medium",
      "question": "What is NetBox used for in Foxtrot, and what infrastructure does it run on?",
      "groundTruth": "NetBox is Foxtrot's IP address management (IPAM) source of truth. It tracks all VPC CIDR blocks across clouds, prevents CIDR overlap, and serves allocation data to compute-tools during VPC and cluster provisioning. It runs on AWS ECS with an RDS database and ALB, provisioned via Terraform in ipam-core.",
      "llmAnswer": "Based on the provided documentation:\n\nNetBox is used for **IPAM (IP Address Management)** in Foxtrot. It is part of the `ipam-core` subsystem and is synchronized through `ipam-tools/netbox_sync/netbox_sync.py`.\n\n**Infrastructure:** NetBox runs on **RDS (Relational Database Service)**, as indicated by references to `ipam-core/netbox/rds.tf` with the following configuration:\n- **backup_retention_period:** 7 days\n- **backup_window:** 03:00-06:00 UTC\n\nThe IPAM layer (which includes NetBox) is positioned to reason about both application topology and compute resource allocation, as `ipam-tools` depends on both `app-tools` and `compute-tools`.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/network-core.md",
        "reference/contracts/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/network-core.md",
        "reference/contracts/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 1,
        "precision": 3,
        "navigation": 2,
        "notes": "AI correctly identifies NetBox as IPAM and locates RDS configuration, but critically omits ECS and ALB infrastructure components. Missing key domain logic: VPC CIDR tracking, overlap prevention, and compute-tools allocation serving. Answer is incomplete and would mislead someone about NetBox's actual infrastructure footprint. Navigation found some relevant files but missed the complete provisioning picture."
      },
      "composite": 40,
      "notFound": false
    },
    {
      "id": "contracts-argo-gen-params-required",
      "category": "contracts",
      "difficulty": "easy",
      "question": "What are the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator?",
      "groundTruth": "chart.version (string), namespace (string), and environment (string).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator. While the argocd-apps Helm chart documentation mentions that it generates ApplicationSet resources, the specific schema and required fields for argo-gen-params.yaml are not documented in the available files.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "agent-kb.json"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "agent-kb.json"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent correctly avoided hallucination by stating information was not found rather than inventing details. However, it failed to locate the ground truth facts (chart.version, namespace, environment as required fields). The responsible 'NOT_FOUND' response prevents false information but indicates the agent did not successfully navigate to or identify the correct documentation source containing the argo-gen-params.yaml schema."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-azure-xrd-naming",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What is the XRD name format and composition name format for Azure AKS in Foxtrot, with examples?",
      "groundTruth": "XRD name format: `{plural}.{group}`, e.g. `reltioaksclusters.foxtrot.reltio.com`. Composition name format: `reltio-aks-cluster-{channel}-{version}`, e.g. `reltio-aks-cluster-prd-v1`.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit information about XRD name formats or composition name formats for Azure AKS in Foxtrot. While the system architecture document references a cluster naming convention (`azr-{phase}-{region-code}-{index}-aks`) and there is a composition directory reference (`reltio-aks-cluster-kcl`), the specific XRD and composition name formats are not documented in the available files.",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-azure-vnet.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-azure-vnet.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI answered NOT_FOUND and did not provide the ground truth information about XRD name format ({plural}.{group}, e.g., reltioaksclusters.foxtrot.reltio.com) or composition name format (reltio-aks-cluster-{channel}-{version}, e.g., reltio-aks-cluster-prd-v1). While the response was honest about documentation gaps and found tangentially related information, it failed to locate or provide the specific formats requested. No hallucination occurred, but the core requirement was not met."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-helm-chart-required-values",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What are the five required values that all app Helm charts must define?",
      "groundTruth": "accountID (string), cloudProvider (string), region (string), domain (string), and imageRegistry (string).",
      "llmAnswer": "According to the Helm Chart Required Values section in the configuration reference, all app Helm charts must define these five required values:\n\n1. `replicaCount` — number of pod replicas\n2. `image.repository` — container image repository\n3. `image.tag` — container image tag\n4. `service.port` — service port number\n5. `resources` — CPU/memory resource requests and limits",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/configuration.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/configuration.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 0,
        "navigation": 0,
        "notes": "The AI provided five completely incorrect values (replicaCount, image.repository, image.tag, service.port, resources) that bear no relation to the ground truth (accountID, cloudProvider, region, domain, imageRegistry). This appears to be a hallucination of generic Helm chart values rather than the specific required configuration values. The agent failed to locate or consult the correct documentation source."
      },
      "composite": 0,
      "notFound": false
    },
    {
      "id": "contracts-sync-wave-ordering",
      "category": "contracts",
      "difficulty": "hard",
      "question": "What are the ArgoCD sync wave values and what resource types are deployed at each wave?",
      "groundTruth": "Wave -5: prerequisites. Wave 0: infrastructure. Wave 5: state apps. Wave 10: application services. Wave 15: post-deployment resources (monitoring, backup hooks, finalizers).",
      "llmAnswer": "Based on the \"Helm Sync Waves (Bootstrapping Order)\" table in reference/helm/index.md:\n\n| Wave | Count | Resources |\n|---|---|---|\n| -1 | 1 | ServiceAccount:pre-sync-app-templates-sa |\n| 0 | 3 | Application:{{, xReltioAKSCluster:{{, ReltioEKSCluster:{{ |\n| 1 | 5 | ClusterRole:pre-sync-app-templates-clusterrole, Application:{{, Workflow:kong-gateway-setup-{{, AccessEntry:{{, Addon:{{ |\n| 2 | 4 | ClusterRoleBinding:pre-sync-app-templates-clusterrolebinding, Workflow:{{, ProviderConfig:{{, ProviderConfig:{{ |\n| 3 | 4 | Workflow:{{, Application:kong-gateway, Release:{{, Release:{{ |\n| 4 | 1 | Release:{{ |\n| 5 | 1 | Job:pre-sync-app-templates-check-job |\n\nThe sync waves range from -1 to 5, with wave -1 executing first (ServiceAccount creation), followed by progressive waves that deploy Applications, Workflows, ProviderConfigs, Releases, and finally a validation Job at wave 5.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 2,
        "navigation": 2,
        "notes": "The AI answer describes sync waves -1 through 5, while ground truth specifies waves -5, 0, 5, 10, and 15. The wave values are fundamentally misaligned. The AI found a structured source (reference/helm/index.md) and extracted data competently, but this appears to be either the wrong source or a different system configuration entirely. Missing waves 10 and 15 entirely. The detailed resource listings don't correspond to the ground truth's high-level categorization (prerequisites, infrastructure, state apps, application services, post-deployment resources)."
      },
      "composite": 30,
      "notFound": false
    }
  ]
}