dev-intel-v2/eval-v2-baseline.json

{
  "evalType": "agent",
  "timestamp": "2026-03-10T05:15:30.984Z",
  "docsDir": "./foxtrot-docs",
  "totalQuestions": 32,
  "overallScore": "26.4",
  "avgAccuracy": "0.34",
  "avgCompleteness": "0.28",
  "avgPrecision": "4.25",
  "avgNavigation": "0.41",
  "notFoundCount": 26,
  "notFoundRate": "81.3%",
  "byCategory": {
    "architecture": {
      "avg": "25.8",
      "count": 6
    },
    "dependencies": {
      "avg": "26.0",
      "count": 5
    },
    "operations": {
      "avg": "25.0",
      "count": 6
    },
    "configuration": {
      "avg": "22.5",
      "count": 6
    },
    "services": {
      "avg": "36.0",
      "count": 5
    },
    "contracts": {
      "avg": "23.8",
      "count": 4
    }
  },
  "byDifficulty": {
    "easy": {
      "avg": "27.5",
      "count": 10
    },
    "medium": {
      "avg": "25.6",
      "count": 17
    },
    "hard": {
      "avg": "27.0",
      "count": 5
    }
  },
  "results": [
    {
      "id": "arch-layered-order",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What are the five layers in Foxtrot's layered architecture, listed from top to bottom?",
      "groundTruth": "Application Layer (app-*), Compute Layer (compute-*), Network Layer (network-*), Account Layer (account-*), Control Plane (control-core).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about five layers in Foxtrot's layered architecture listed from top to bottom. The system architecture document describes subsystems organized around different domains (account, compute, network, application) and distinguishes between declarative configuration subsystems and imperative tooling subsystems, but does not present a five-layer architectural model.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "diagrams/system-deps.mmd",
        "reference/helm/index.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI failed to locate the documented five-layer architecture. Ground truth explicitly defines the layers (Application, Compute, Network, Account, Control Plane), but the agent reported NOT_FOUND. No hallucination occurred, but the core task—finding and reporting the correct architectural layers—was not accomplished."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "arch-hub-spoke-ownership",
      "category": "architecture",
      "difficulty": "easy",
      "question": "Which ArgoCD instance owns the account, network, and compute layers, and which owns the application layer?",
      "groundTruth": "Hub ArgoCD owns account, network, and compute layers (infrastructure and control-plane appsets). Spoke ArgoCD owns the application layer (app-runtime appsets only).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes multiple ArgoCD instances across different environments (control-plane-compute-hub, hub-dev, hub-iqa, hub-prd) and references various ApplicationSets for account creation, VPCs, and feature clusters, but does not explicitly state which ArgoCD instance owns the account, network, and compute layers versus the application layer.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/control-core.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/control-core.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly avoided hallucination by returning NOT_FOUND rather than inventing an answer. However, it failed to locate the ground truth information about Hub ArgoCD owning infrastructure/control-plane layers and Spoke ArgoCD owning the application layer. The agent did not demonstrate successful navigation to the relevant documentation or configuration files that contain this ownership model."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "arch-aws-regions",
      "category": "architecture",
      "difficulty": "easy",
      "question": "What AWS regions does Foxtrot support?",
      "groundTruth": "us-east-1, us-west-2, eu-west-1, eu-central-1.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about which AWS regions Foxtrot supports. The files describe the Helm chart structure, configuration surface, and system architecture, but do not specify regional support details.",
      "filesRead": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/subsystems/compute-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND instead of providing the correct regional support information. While the response avoided hallucination (high precision), it failed to locate or retrieve the ground truth data about supported AWS regions."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "arch-gcp-shared-vpc-host",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the default GCP host project used for Shared VPC in network-common, and what is the host network path it attaches to?",
      "groundTruth": "The default host project is `network-services-436015`. When `sharedVpc.enabled: true`, service project resources attach to the host network path `projects/HOST_PROJECT/global/networks/gcp-core-network`.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the `foxtrot-gcp-vpc` chart structure and indicates it has a `vpc` configuration object, but does not contain the actual default values for the GCP host project or host network path. The specific configuration details would be in the `values.yaml` file, which is not included in the documentation excerpts provided.",
      "filesRead": [
        "reference/subsystems/network-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/network-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/subsystems/network-common.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/network-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND without locating the required configuration values. While the explanation about needing values.yaml is reasonable and avoids hallucination, the agent failed to retrieve the ground truth facts: host project 'network-services-436015' and network path 'projects/HOST_PROJECT/global/networks/gcp-core-network'. No navigation to the correct source occurred."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "arch-cidr-employee-access",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for the employee access (bastions) segment on AWS and on GCP?",
      "groundTruth": "AWS: 10.208.128.0/24. GCP: 10.128.128.0/24.",
      "llmAnswer": "NOT_FOUND\n\nThe documentation provided shows the file structure of the network-core subsystem, including references to employee-access VPC configurations on AWS (us-east-1) and employee-access subnet configurations on GCP (europe-west1), but the actual CIDR range values are not included in the reference materials. The specific CIDR ranges would be defined in the Terraform configuration files (main.tf, variables.tf) within those directories, which were not provided.",
      "filesRead": [
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/network-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 1,
        "notes": "AI correctly identified that CIDR ranges were not found in provided materials and avoided hallucination, but failed to retrieve the actual values from Terraform configuration files. The agent demonstrated awareness of where the information should exist (main.tf, variables.tf) but did not access those files to extract the ground truth values (AWS: 10.208.128.0/24, GCP: 10.128.128.0/24). Navigation score reflects partial directory awareness without successful data retrieval."
      },
      "composite": 30,
      "notFound": true
    },
    {
      "id": "arch-production-cidr",
      "category": "architecture",
      "difficulty": "medium",
      "question": "What is the CIDR range for production workloads on AWS and on GCP?",
      "groundTruth": "AWS: 10.192.0.0/10. GCP: 10.128.0.0/10.",
      "llmAnswer": "NOT_FOUND\n\nThe documentation provided describes the structure and organization of network infrastructure subsystems but does not contain the specific CIDR ranges for production workloads on AWS or GCP. This information would be in the actual Terraform configuration files (e.g., `network-core/network/aws/vpc/*/main.tf` or `network-core/network/gcp/core/main.tf`), which are not included in the reference documentation.",
      "filesRead": [
        "reference/subsystems/network-core.md",
        "reference/subsystems/ipam-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/subsystems/network-core.md",
        "reference/subsystems/ipam-core.md",
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/helm/charts/network-common-charts-foxtrot-gcp-vpc.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND instead of providing the required CIDR ranges (AWS: 10.192.0.0/10, GCP: 10.128.0.0/10). While the agent correctly identified where the information should exist (Terraform config files), it failed to locate or retrieve the actual values. No hallucination occurred, but the core requirement was not met."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-runtime-common-horizontal",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "Which runtime repositories consume charts from which common repositories?",
      "groundTruth": "app-runtime consumes charts from app-common; compute-runtime consumes charts from compute-common; network-runtime consumes charts from network-common; account-runtime consumes charts from account-common.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the Helm charts available in common repositories (account-common, app-common, compute-common, network-common) and shows their internal dependencies, but does not explicitly document which runtime repositories consume charts from which common repositories. The runtime subsystem is described as containing materialized deployment artifacts rather than actively consuming charts, but the specific consumption relationships are not detailed in the provided files.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/helm/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/helm/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/system-architecture.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND instead of identifying the consumption relationships between runtime and common repositories. Per scoring rules for NOT_FOUND answers: zero accuracy and completeness (no correct facts provided), full precision (no hallucination), zero navigation (failed to locate the information)."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-vertical-layers",
      "category": "dependencies",
      "difficulty": "easy",
      "question": "What are the vertical layer dependencies in Foxtrot's architecture?",
      "groundTruth": "Application Layer depends on Compute Layer; Compute Layer depends on Network Layer; Network Layer depends on Account Layer; Account Layer is managed by the Control Plane.",
      "llmAnswer": "\n\nBased on the system architecture documentation, Foxtrot's vertical layer dependencies are organized as follows:\n\n**Imperative Tooling Layer Stack:**\n\n1. **Base Layer:** `app-tools` - foundational utility layer with 92 functions\n2. **Middle Layer:** `compute-tools` - depends on `app-tools` for machine-level operations (108 functions)\n3. **Top Layer:** `ipam-tools` - depends on both `app-tools` and `compute-tools` because IP address management requires compute context (104 functions)\n\n**Cross-Cutting Dependency:**\n- `root` - a single-file configuration anchor that all subsystems depend on for module resolution and compilation context (structural dependency, not behavioral)\n\n**Declarative Configuration Layers:**\n- `account-common`, `network-common`, `network-core`, `control-core`, and `app-common` operate as declarative IaC layers with no executable logic and no upstream dependencies (they describe desired state rather than invoke behavior)\n\nThe architecture trades encapsulation for auditability by keeping configuration as inert files and logic in explicitly-named tools subsystems, creating a clear vertical separation between declarative configuration and imperative tooling.",
      "filesRead": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "reference/subsystems/root.md",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "diagrams/system-deps.mmd",
        "reference/subsystems/root.md",
        "reference/subsystems/control-core.md",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 2,
        "navigation": 1,
        "notes": "AI answer describes tool subsystems (app-tools, compute-tools, ipam-tools) rather than the logical layer dependencies stated in ground truth (Application→Compute→Network→Account→Control Plane). The answer introduces unrelated concepts like function counts and declarative configuration layers. No mention of the required dependency chain or Control Plane management relationship. Appears to have accessed different documentation or misinterpreted the architecture entirely."
      },
      "composite": 25,
      "notFound": false
    },
    {
      "id": "dep-create-account-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "How many PRs does `create-account` produce and which repositories does it touch?",
      "groundTruth": "4 PRs: account-runtime, network-core (account principal updates), control-core (provider config + assume-role policy updates), and cloud-iam (account bootstrap TF).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about how many PRs `create-account` produces or which repositories it touches.",
      "filesRead": [
        "reference/contracts/index.md",
        "reference/subsystems/account-common.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "diagrams/all-contracts.mmd",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "reference/subsystems/account-common.md",
        "reference/helm/charts/account-common-charts-account-creation.md",
        "diagrams/all-contracts.mmd",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI failed to locate documentation containing create-account PR information. Ground truth specifies 4 PRs across 4 repositories with specific domain details (account principal updates, provider config, assume-role policy, account bootstrap TF). AI's NOT_FOUND response indicates search/navigation failure, though it avoided hallucination by not inventing answers."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-create-cluster-repos",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "Which repositories does `create-cluster` commit to, and how many PRs does it typically produce?",
      "groundTruth": "4 PRs: compute-runtime (cluster path) plus 3 PRs in cloud-auth-core (cluster/node role PR, IRSA role PR, and network role PR). It also calls cluster-netbox-allocate.py to reserve/allocate cluster subnet CIDRs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain any information about `create-cluster`, which repositories it commits to, or how many PRs it typically produces.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/subsystems/control-core.md",
        "reference/contracts/index.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/subsystems/control-core.md",
        "reference/contracts/index.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND without providing any of the ground truth information: the 4 PRs across compute-runtime and cloud-auth-core, the specific role PRs (cluster/node, IRSA, network), or the cluster-netbox-allocate.py call. While the agent avoided hallucination, it failed to locate or retrieve the required documentation."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "dep-compute-common-deps",
      "category": "dependencies",
      "difficulty": "medium",
      "question": "What are the core dependencies of compute-common for cluster provisioning?",
      "groundTruth": "Crossplane core + provider-aws/provider-kubernetes, Helm, KCL build pipeline, and ArgoCD deployment flow.",
      "llmAnswer": "Based on the documentation provided, the core dependencies of `compute-common` for cluster provisioning are:\n\n1. **root** - The primary dependency (29 imports, 1 call). Provides foundational configuration, credential resolution, and environment abstractions that compute-common translates into cluster-scoped operations.\n\n2. **kubectl_helper** - Used to translate root-level primitives into cluster-scoped operations via kubectl commands.\n\n3. **helm_helper** - Used to translate root-level primitives into cluster-scoped operations via Helm deployments.\n\nThese dependencies enable compute-common to function as a shared test harness and tooling layer that consolidates reusable lifecycle tests, cluster helpers, and workload fixtures across provider-specific subsystems (AKS, EKS, GKE). The subsystem packages both executable test logic and static configuration (Helm charts, values files, Kubernetes manifests) needed for cluster provisioning across multiple cloud providers.",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "diagrams/compute-common-contracts.mmd",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "diagrams/compute-common-contracts.mmd",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "diagrams/all-contracts.mmd"
      ],
      "score": {
        "accuracy": 2,
        "completeness": 1,
        "precision": 2,
        "navigation": 1,
        "notes": "AI identified internal helper utilities (kubectl_helper, helm_helper, root) but missed all four core infrastructure dependencies from ground truth: Crossplane core, provider-aws/provider-kubernetes, KCL build pipeline, and ArgoCD. The answer describes implementation-level helpers rather than the actual cluster provisioning infrastructure dependencies. Navigation failed to locate the correct architectural components."
      },
      "composite": 30,
      "notFound": false
    },
    {
      "id": "ops-argocd-deployment-flow",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the ArgoCD deployment flow for an application, from ApplicationSet watching app-runtime to pods starting?",
      "groundTruth": "1. ApplicationSet watches app-runtime. 2. Reads argo-gen-params.yaml per environment. 3. Generates Application. 4. Pulls chart from OCI registry. 5. Merges values and overrides. 6. Renders manifests. 7. Applies to cluster. 8. ESO fetches secrets. 9. Pods start with injected configs.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the argocd-apps chart structure (which generates ApplicationSets for addons, apps, kong-gateway, etc.) and the runtime subsystem's role in the broader architecture, but does not contain a detailed step-by-step description of the ArgoCD deployment flow from ApplicationSet watching app-runtime through to pod startup.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "diagrams/helm-interactions.mmd",
        "reference/helm/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "diagrams/helm-interactions.mmd",
        "reference/helm/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI returned NOT_FOUND instead of providing the 9-step ArgoCD deployment flow. While the response avoids hallucination (hence precision=5), it fails to locate or articulate the documented process from ApplicationSet watching through pod startup, including critical steps like argo-gen-params.yaml reading, OCI registry pulls, manifest rendering, ESO secret fetching, and config injection."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-ebf-release-pattern",
      "category": "operations",
      "difficulty": "medium",
      "question": "What are the steps in the EBF/weekly release pattern?",
      "groundTruth": "1. Create release branch from develop. 2. Promote versions for release/hotfix environments. 3. Run stage pipelines (preview, preprod, production). 4. Merge promoted changes to main. 5. Reconcile hotfix and develop branches.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about the steps in the EBF/weekly release pattern. The files cover system architecture, subsystems, Helm charts, change impact analysis, and system contracts, but do not include release process documentation.",
      "filesRead": [
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "reference/subsystems/control-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent could not locate EBF/weekly release pattern documentation and returned NOT_FOUND. No information provided about the 5-step release process (branch creation, version promotion, stage pipelines, merge to main, branch reconciliation). While the agent avoided hallucination, it failed to find or retrieve the required information."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-rollback-procedure",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the preferred rollback procedure in Foxtrot, and when are script-level deletes used instead?",
      "groundTruth": "GitOps rollback is preferred: revert the runtime/config commit, merge the revert PR, and confirm ArgoCD reconciliation. Script-level deletes (delete-vpc.sh, delete-cluster.sh, delete-app.sh) are used only with explicit approval and change control.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about Foxtrot's preferred rollback procedure or when script-level deletes are used. The available documentation covers system architecture, subsystems, Helm charts, and change impact analysis, but does not address rollback procedures or deletion strategies.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "explanation/change-impact.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly reported information was not found in available documentation rather than hallucinating. However, ground truth exists, indicating the agent failed to locate the relevant documentation containing Foxtrot's GitOps rollback procedures and script-level delete policies."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-branch-cluster-mapping",
      "category": "operations",
      "difficulty": "easy",
      "question": "What is the branch-to-cluster mapping in app-runtime? Which branch targets which environments?",
      "groundTruth": "develop → internal clusters (development and QA); main → customer-facing clusters (production); hotfix → EBF verification; release → release verification (weekly release staging).",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation files do not contain information about branch-to-cluster mappings in app-runtime. While the runtime.md file lists app-runtime directory structures with various environments (dev, prd, iqa, test) and clusters, there is no explicit mapping of git branches to target environments or clusters.",
      "filesRead": [
        "reference/subsystems/runtime.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/subsystems/runtime.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI failed to locate branch-to-cluster mapping information that exists in documentation. Ground truth specifies four distinct mappings (develop→internal, main→production, hotfix→EBF, release→staging) but agent reported NOT_FOUND, indicating inadequate search or documentation exploration."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-jenkins-jobs",
      "category": "operations",
      "difficulty": "hard",
      "question": "What are the six Jenkins jobs in the Foxtrot pipeline and what does each do?",
      "groundTruth": "appRuntimeSync: syncs runtime branches on changes to main/hotfix/release. foxtrotSync: updates imagetags-override.yaml with new image tags. foxtrotPromote: promotes and merges release outputs. chartSync: updates argo-gen-params.yaml with chart versions. chartCheckInAppCommon: detects chart version mismatches and triggers chartSync. chartPromoteVersion: promotes chart versions per release stage.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about six Jenkins jobs in the Foxtrot pipeline. The Jenkins chart documentation describes Kubernetes resources generated by the chart (ConfigMaps, Services, StatefulSets, etc.), but does not detail specific Jenkins jobs or pipeline stages. The Foxtrot references in the documentation refer to infrastructure templates (foxtrot-eks, foxtrot-gke, foxtrot-aks, etc.), not Jenkins pipeline definitions.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-jenkins.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/system-architecture.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the searched documentation (Jenkins chart, infrastructure templates) did not contain pipeline job definitions, demonstrating honest assessment of available sources. However, it failed to locate the correct documentation source containing the six Foxtrot pipeline jobs and their specifications. The answer is factually incomplete—these jobs do exist and are documented elsewhere—but contains no hallucinated information about the jobs themselves."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "ops-create-cluster-timeout",
      "category": "operations",
      "difficulty": "medium",
      "question": "What is the timeout for waiting for a cluster to reach ready condition in the create-cluster workflow?",
      "groundTruth": "3600 seconds (1 hour), with a poll interval of 30 seconds.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about the timeout for waiting for a cluster to reach ready condition in the create-cluster workflow.",
      "filesRead": [
        "reference/subsystems/control-core.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/helm/charts/app-common-apps-provider-addons-charts-common-cluster.md",
        "reference/subsystems/compute-common.md",
        "reference/contracts/index.md"
      ],
      "plannedFiles": [
        "reference/subsystems/control-core.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/helm/charts/app-common-apps-provider-addons-charts-common-cluster.md",
        "reference/subsystems/compute-common.md",
        "reference/contracts/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent failed to locate the timeout configuration (3600 seconds with 30-second poll interval) that exists in the documentation. NOT_FOUND response indicates unsuccessful navigation to the relevant source material."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-cloud-resource-naming",
      "category": "configuration",
      "difficulty": "easy",
      "question": "What is the canonical format for cloud resource names in Foxtrot, and what does each component represent?",
      "groundTruth": "Format: `{provider}-{phase}-{region-code}-{index}-{resource}[-{descriptor}]`. provider: aws/gcp/azr; phase: prd/iqa/dev; region-code: two-letter derived code; index: incremental integer; resource: vpc/eks/gke/aks/lb/rds. Example: `gcp-prd-us-1-vpc-blue`.",
      "llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation does not contain information about a canonical format for cloud resource names in Foxtrot or what each component represents. While the files reference Foxtrot templates (foxtrot-aks, foxtrot-eks, foxtrot-gke, etc.) and various Kubernetes resource types, there is no explicit specification of a naming convention or format for cloud resources.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "reference/subsystems/compute-common.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/root.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "reference/subsystems/compute-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND and failed to locate the canonical naming format specification. The ground truth clearly defines the format with all components (provider, phase, region-code, index, resource, optional descriptor) and provides a concrete example. Agent demonstrated no ability to navigate to the correct information source."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-region-code-algorithm",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the algorithm for deriving a two-letter region code from a cloud region name, and what is the region code for `westeurope`?",
      "groundTruth": "Algorithm: 1. Remove directional words (east, west, north, south). 2. Remove separators. 3. Take first two characters. For `westeurope`: remove 'west' → 'europe' → 'eu'.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about an algorithm for deriving two-letter region codes from cloud region names, nor do they specify what the region code for `westeurope` is.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/subsystems/runtime.md",
        "reference/subsystems/compute-common.md",
        "reference/subsystems/network-common.md",
        "reference/helm/index.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/subsystems/runtime.md",
        "reference/subsystems/compute-common.md",
        "reference/subsystems/network-common.md",
        "reference/helm/index.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 0,
        "navigation": 0,
        "notes": "Parse error: I can't discuss that.\n\nIf you've got a coding or infrastructure question I can help with, I'm here f"
      },
      "composite": 0,
      "notFound": true
    },
    {
      "id": "config-app-config-merge-order",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the configuration merge order for app charts in Foxtrot, from lowest to highest precedence?",
      "groundTruth": "1. values.yaml, 2. default-properties.yaml, 3. default-values.yaml / k8s-values.yaml, 4. common-values.yaml, 5. properties-override.yaml, 6. k8s-override.yaml, 7. imagetags-override.yaml.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain information about the configuration merge order for app charts in Foxtrot or their precedence levels.",
      "filesRead": [
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/app-common-contracts.mmd",
        "explanation/change-impact.md"
      ],
      "plannedFiles": [
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/system-architecture.md",
        "diagrams/app-common-contracts.mmd",
        "explanation/change-impact.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent failed to locate the configuration merge order documentation. The answer 'NOT_FOUND' indicates the agent did not search effectively or access the correct documentation source containing the Foxtrot app chart configuration precedence hierarchy. No hallucination occurred, but the core task—finding and retrieving the specific ordered list of seven configuration files—was not accomplished."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-account-creation-product-id",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What is the AWS Service Catalog product ID used by account-common for account creation, and what are the customer-facing and internal OU IDs?",
      "groundTruth": "Service Catalog product ID: `prod-mts6togilnnuk`. Customer-facing OU: `ou-8vpz-mogk4qua`. Internal OU: `ou-8vpz-rw7etshv`. Provider config reference: `aws-709659200419`.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files contain information about the account-creation Helm chart structure and its configuration surface (the `accountCreation` object), but do not include specific values for AWS Service Catalog product IDs or OU IDs. These details would typically be found in the actual `values.yaml` file or deployment configuration, which is not included in the reference materials provided.",
      "filesRead": [
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "diagrams/account-common-contracts.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/account-common-charts-account-creation.md",
        "reference/subsystems/account-common.md",
        "reference/contracts/index.md",
        "reference/helm/index.md",
        "diagrams/account-common-contracts.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent returned NOT_FOUND despite ground truth containing specific values. The answer contains no factual claims to be inaccurate, and the reasoning provided is logically sound (explaining absence rather than hallucinating), but the agent failed to locate or retrieve the required information from available sources."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "config-ipam-rds-backup",
      "category": "configuration",
      "difficulty": "hard",
      "question": "What are the RDS automated backup retention period and backup window configured in ipam-core?",
      "groundTruth": "RDS automated backup retention: 7 days. RDS backup window: 03:00-06:00 UTC.",
      "llmAnswer": "NOT_FOUND\n\nThe documentation lists `ipam-core/netbox/rds.tf` as a file in the ipam-core subsystem, but the actual contents of that Terraform configuration file are not provided in the reference materials. Therefore, I cannot determine the RDS automated backup retention period and backup window settings.",
      "filesRead": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/subsystems/ipam-core.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 2,
        "notes": "AI correctly identified the target file (ipam-core/netbox/rds.tf) but failed to retrieve the configuration values. While the agent honestly acknowledged missing reference materials rather than hallucinating answers (high precision), it provided zero correct facts against ground truth of 7-day retention and 03:00-06:00 UTC backup window. Navigation score reflects partial success in locating the right file but inability to access its contents."
      },
      "composite": 35,
      "notFound": true
    },
    {
      "id": "config-dev-artifact-naming",
      "category": "configuration",
      "difficulty": "medium",
      "question": "What is the artifact naming convention for development chart versions vs release chart versions in the OCI registry?",
      "groundTruth": "Development artifacts: `{chart-name}:0.0.0-{commit-sha}`. Release artifacts: `{chart-name}:{chart-version}` (semantic version from Chart.yaml).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about OCI registry artifact naming conventions for development chart versions versus release chart versions.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md",
        "explanation/change-impact.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND without providing the artifact naming conventions. While the response avoids hallucination (precision=5), it fails to deliver any of the ground truth information about development artifacts using commit-sha tags or release artifacts using semantic versioning. The agent did not successfully navigate to or identify the relevant documentation containing these OCI registry conventions."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "services-tech-stack-orchestration",
      "category": "services",
      "difficulty": "easy",
      "question": "What technologies make up the orchestration layer in Foxtrot's technology stack?",
      "groundTruth": "Kubernetes (EKS, GKE, AKS), ArgoCD, and Crossplane.",
      "llmAnswer": "Based on the documentation provided, the orchestration layer in Foxtrot's technology stack consists of:\n\n1. **Argo CD** - For declarative, GitOps-based continuous deployment\n2. **Argo Workflows** - For workflow orchestration\n3. **Argo Events** - For event-driven automation\n4. **Crossplane** - For infrastructure composition and resource orchestration across multiple cloud providers (AWS, Azure, GCP)\n5. **Helm** - For Kubernetes package management and templating\n6. **Kubernetes** - As the underlying container orchestration platform\n\nThese technologies are configured and deployed through Terraform modules in the `control-core` subsystem, which serves as the declarative infrastructure-as-code layer for the control plane. The orchestration layer manages both application deployments (via ArgoCD) and infrastructure provisioning (via Crossplane) across the multi-cloud environment.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md",
        "reference/subsystems/control-core.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "diagrams/helm-interactions.mmd",
        "reference/subsystems/app-common.md",
        "reference/subsystems/control-core.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 3,
        "precision": 2,
        "navigation": 2,
        "notes": "AI correctly identified the three core technologies (Kubernetes, ArgoCD, Crossplane) but hallucinated additional components (Argo Workflows, Argo Events, Helm) not in ground truth. Failed to specify Kubernetes variants (EKS, GKE, AKS). Over-elaboration suggests misinterpretation of source material or assumption-based expansion beyond documented scope."
      },
      "composite": 50,
      "notFound": false
    },
    {
      "id": "services-state-management",
      "category": "services",
      "difficulty": "easy",
      "question": "What are the four state management services used in Foxtrot?",
      "groundTruth": "Elasticsearch, Hazelcast, Redis, and Milvus.",
      "llmAnswer": "Based on the documentation provided, the four state management services used in the system are:\n\n1. **Cassandra**\n2. **Elasticsearch**\n3. **Hazelcast**\n4. **Redis**\n\nThese are deployed as Helm charts in the `app-common` subsystem, with Cassandra, Elasticsearch, and Hazelcast appearing in both `state-apps` and `state-cluster` chart directories, and Redis available as a standalone chart.",
      "filesRead": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md"
      ],
      "plannedFiles": [
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 1,
        "navigation": 2,
        "notes": "AI listed Cassandra, Elasticsearch, Hazelcast, and Redis. Ground truth requires Elasticsearch, Hazelcast, Redis, and Milvus. The answer is missing Milvus (a required service) and includes Cassandra (not in ground truth). While 3 of 4 services are correct, the substitution of Cassandra for Milvus represents a fundamental factual error. Navigation score reflects that the agent found some relevant services but failed to identify the complete and accurate set."
      },
      "composite": 25,
      "notFound": false
    },
    {
      "id": "services-eks-addon-versions",
      "category": "services",
      "difficulty": "hard",
      "question": "What are the default pinned versions for the vpc-cni and coredns EKS addons in compute-common?",
      "groundTruth": "vpc-cni: v1.20.1-eksbuild.1. coredns: v1.12.2-eksbuild.4.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation describes the structure and resources of the `foxtrot-eks` chart and references a file `compute-common/templates/foxtrot-eks/templates/crossplane-eks-addons.yaml` that would contain EKS addon configurations, but the actual content of that file and the specific version pinning for vpc-cni and coredns addons is not included in the documentation provided.",
      "filesRead": [
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/index.md",
        "diagrams/compute-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/subsystems/compute-common.md",
        "reference/helm/charts/compute-common-templates-foxtrot-eks.md",
        "reference/helm/index.md",
        "diagrams/compute-common-contracts.mmd",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent failed to locate the specific EKS addon version pinning information. While it correctly identified the relevant file path (crossplane-eks-addons.yaml), it did not retrieve or provide the actual vpc-cni (v1.20.1-eksbuild.1) and coredns (v1.12.2-eksbuild.4) version values from that configuration."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "services-aws-nat-egress-model",
      "category": "services",
      "difficulty": "medium",
      "question": "What is the AWS NAT egress model for internal VPCs vs production clusters in Foxtrot?",
      "groundTruth": "Internal AWS VPCs use shared NAT egress. Production AWS clusters use dedicated NAT egress per cluster. This split reduces cost for internal environments while isolating production egress and capacity per cluster.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain specific information about the AWS NAT egress model for internal VPCs versus production clusters in Foxtrot. While the `foxtrot-aws-vpc` chart documentation lists the resources generated (VPC, InternetGateway, Subnets, Routes, etc.), it does not detail the NAT configuration strategy or differences between internal and production deployments.",
      "filesRead": [
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/subsystems/network-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/network-core.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/network-common-charts-foxtrot-aws-vpc.md",
        "reference/subsystems/network-common.md",
        "reference/system-architecture.md",
        "reference/helm/index.md",
        "reference/subsystems/network-core.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI correctly identified that the queried documentation lacked specific NAT egress configuration details, avoiding hallucination. However, it failed to locate or retrieve the ground truth information about shared NAT for internal VPCs versus dedicated NAT per production cluster. The answer demonstrates honest limitations but provides zero factual coverage of the required domain knowledge."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "services-ipam-netbox-role",
      "category": "services",
      "difficulty": "medium",
      "question": "What is NetBox used for in Foxtrot, and what infrastructure does it run on?",
      "groundTruth": "NetBox is Foxtrot's IP address management (IPAM) source of truth. It tracks all VPC CIDR blocks across clouds, prevents CIDR overlap, and serves allocation data to compute-tools during VPC and cluster provisioning. It runs on AWS ECS with an RDS database and ALB, provisioned via Terraform in ipam-core.",
      "llmAnswer": "Based on the provided documentation:\n\n**What NetBox is used for:**\nNetBox is used for IP address lifecycle management in Foxtrot, specifically for:\n- Prefix allocation\n- VPC registration\n- VRF (Virtual Routing and Forwarding) scoping\n- CIDR reconciliation\n\n**Infrastructure it runs on:**\nNetBox runs on AWS infrastructure, as evidenced by the Terraform configuration files in `ipam-core/netbox/`, which include:\n- ECS (Elastic Container Service)\n- RDS (Relational Database Service)\n- Redis\n- ALB (Application Load Balancer)\n- ASG (Auto Scaling Group)\n- VPC (Virtual Private Cloud)\n- Security groups and related networking components\n\nThe `ipam-tools` subsystem provides the domain layer that encapsulates all Netbox-backed operations through classes like `NetboxVPCAllocator` and `NetboxVPCRegistrar`.",
      "filesRead": [
        "reference/subsystems/ipam-core.md",
        "reference/subsystems/network-core.md",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "reference/subsystems/ipam-tools.md"
      ],
      "plannedFiles": [
        "reference/subsystems/ipam-core.md",
        "reference/subsystems/network-core.md",
        "reference/system-architecture.md",
        "reference/contracts/index.md",
        "reference/subsystems/ipam-tools.md"
      ],
      "score": {
        "accuracy": 3,
        "completeness": 2,
        "precision": 3,
        "navigation": 3,
        "notes": "Answer captures basic infrastructure (ECS, RDS, ALB) and general IPAM purpose but omits critical ground truth details: NetBox as 'source of truth', CIDR overlap prevention, and compute-tools integration during provisioning. Adds unverified components (Redis, ASG) and implementation details (ipam-tools classes) beyond scope. References correct directory (ipam-core) but doesn't demonstrate finding the specific functional requirements or cross-cloud CIDR tracking mentioned in ground truth."
      },
      "composite": 55,
      "notFound": false
    },
    {
      "id": "contracts-argo-gen-params-required",
      "category": "contracts",
      "difficulty": "easy",
      "question": "What are the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator?",
      "groundTruth": "chart.version (string), namespace (string), and environment (string).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation files do not contain information about the required fields in an argo-gen-params.yaml file for the ArgoCD ApplicationSet generator. The documentation covers the argocd-apps Helm chart, system architecture, and contracts, but does not detail the structure or required fields of argo-gen-params.yaml files.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/subsystems/app-common.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI agent returned NOT_FOUND instead of providing the required fields (chart.version, namespace, environment). While the response contains no hallucinated information, it failed to locate or provide the ground truth answer."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-azure-xrd-naming",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What is the XRD name format and composition name format for Azure AKS in Foxtrot, with examples?",
      "groundTruth": "XRD name format: `{plural}.{group}`, e.g. `reltioaksclusters.foxtrot.reltio.com`. Composition name format: `reltio-aks-cluster-{channel}-{version}`, e.g. `reltio-aks-cluster-prd-v1`.",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not explicitly define the XRD name format or composition name format for Azure AKS in Foxtrot. While the files reference `reltio-aks-cluster-kcl` as a composition directory path and `foxtrot-aks` as a chart name, the specific naming conventions and formats for XRDs and compositions are not documented in the available reference materials.",
      "filesRead": [
        "reference/contracts/index.md",
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/helm/charts/network-common-charts-foxtrot-azure-vnet.md",
        "reference/helm/index.md",
        "reference/subsystems/compute-common.md"
      ],
      "plannedFiles": [
        "reference/contracts/index.md",
        "reference/helm/charts/compute-common-templates-foxtrot-aks.md",
        "reference/helm/charts/network-common-charts-foxtrot-azure-vnet.md",
        "reference/helm/index.md",
        "reference/subsystems/compute-common.md"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "Agent correctly identified that the specific XRD and composition naming formats were not found in available documentation, avoiding hallucination. However, it failed to locate the ground truth information that exists elsewhere, resulting in zero accuracy and completeness scores. The honest 'NOT_FOUND' response demonstrates precision by not inventing facts."
      },
      "composite": 25,
      "notFound": true
    },
    {
      "id": "contracts-helm-chart-required-values",
      "category": "contracts",
      "difficulty": "medium",
      "question": "What are the five required values that all app Helm charts must define?",
      "groundTruth": "accountID (string), cloudProvider (string), region (string), domain (string), and imageRegistry (string).",
      "llmAnswer": "Based on the contracts documentation in the provided files, the five required values that all app Helm charts must define are:\n\n1. `clusterName` (string)\n2. `environmentType` (string)\n3. `arch` (string)\n4. `cloudProvider` (string)\n5. `region` (string)\n\nThese values appear consistently across all app charts (ai-app, auth-app, cleanse-app, mdm-app, pms-app, rdm-app, shared-app) as defined in the HelmValues contracts.",
      "filesRead": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/system-architecture.md"
      ],
      "plannedFiles": [
        "reference/helm/index.md",
        "reference/subsystems/app-common.md",
        "reference/contracts/index.md",
        "reference/helm/charts/app-common-charts-cluster.md",
        "reference/system-architecture.md"
      ],
      "score": {
        "accuracy": 1,
        "completeness": 1,
        "precision": 1,
        "navigation": 1,
        "notes": "AI identified only 2 of 5 correct values (cloudProvider, region) and included 3 incorrect values (clusterName, environmentType, arch). Missed accountID, domain, and imageRegistry entirely. Despite citing documentation sources, the agent clearly consulted incorrect or misinterpreted information, suggesting poor navigation to the right specification or fundamental misunderstanding of the requirements."
      },
      "composite": 20,
      "notFound": false
    },
    {
      "id": "contracts-sync-wave-ordering",
      "category": "contracts",
      "difficulty": "hard",
      "question": "What are the ArgoCD sync wave values and what resource types are deployed at each wave?",
      "groundTruth": "Wave -5: prerequisites. Wave 0: infrastructure. Wave 5: state apps. Wave 10: application services. Wave 15: post-deployment resources (monitoring, backup hooks, finalizers).",
      "llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit information about ArgoCD sync wave values and which resource types are deployed at each wave. While there are references to \"status-check-sync-wave-\" in some workload definitions (such as in auth-app and mdm-app), the specific sync wave numbers and a comprehensive mapping of resource types to each wave are not documented in the available files.",
      "filesRead": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd"
      ],
      "plannedFiles": [
        "reference/helm/charts/app-common-charts-argocd-apps.md",
        "reference/helm/index.md",
        "reference/contracts/index.md",
        "reference/subsystems/app-common.md",
        "diagrams/helm-interactions.mmd"
      ],
      "score": {
        "accuracy": 0,
        "completeness": 0,
        "precision": 5,
        "navigation": 0,
        "notes": "AI claimed information was not found in documentation, but ground truth specifies five distinct sync waves (-5, 0, 5, 10, 15) with mapped resource types. The agent failed to locate or retrieve this documented information, resulting in complete failure on accuracy and completeness. Precision remains high because the agent made no false claims about wave values—it simply failed to find them."
      },
      "composite": 25,
      "notFound": true
    }
  ]
}