{ "evalType": "agent", "timestamp": "2026-03-11T07:29:50.679Z", "docsDir": "./foxtrot-docs", "totalQuestions": 27, "overallScore": "81.5", "avgAccuracy": "4.11", "avgCompleteness": "3.93", "avgPrecision": "4.41", "avgNavigation": "3.85", "notFoundCount": 1, "notFoundRate": "3.7%", "byCategory": { "structural": { "avg": "100.0", "count": 4 }, "dependency": { "avg": "88.0", "count": 5 }, "contract": { "avg": "55.0", "count": 1 }, "configuration": { "avg": "100.0", "count": 5 }, "interaction": { "avg": "72.5", "count": 2 }, "resource": { "avg": "66.3", "count": 4 }, "cross-subsystem": { "avg": "100.0", "count": 2 }, "architectural": { "avg": "42.5", "count": 2 }, "scenario": { "avg": "55.0", "count": 2 } }, "byDifficulty": { "easy": { "avg": "91.5", "count": 10 }, "medium": { "avg": "75.8", "count": 12 }, "hard": { "avg": "75.0", "count": 5 } }, "results": [ { "id": "q-001", "category": "structural", "difficulty": "easy", "question": "How many Helm charts are defined across this repository?", "groundTruth": "76", "llmAnswer": "76", "filesRead": [ "reference/helm/index.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-deployment.md", "agent-kb.json" ], "plannedFiles": [ "reference/helm/index.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-deployment.md", "agent-kb.json" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly. Provided the precise count requested with no extraneous information or hallucinations. Correct identification and enumeration of all Helm charts across the repository." }, "composite": 100, "notFound": false }, { "id": "q-002", "category": "structural", "difficulty": "easy", "question": "How many subsystems does this codebase contain?", "groundTruth": "12", "llmAnswer": "12", "filesRead": [ "reference/index.md", "reference/system-architecture.md", "reference/subsystems/root.md", "reference/subsystem-architecture.md", "reference/helm/index.md" ], "plannedFiles": [ "reference/index.md", "reference/system-architecture.md", "reference/subsystems/root.md", "reference/subsystem-architecture.md", "reference/helm/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI provided the exact correct count with no hallucination or extra information. Answer matches ground truth precisely." }, "composite": 100, "notFound": false }, { "id": "q-003", "category": "structural", "difficulty": "medium", "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", "llmAnswer": "1. cluster (app-common/apps-provider/addons/charts/common/cluster) - 55 resources\n2. cluster (app-common/charts/cluster) - 32 resources\n3. jenkins - 29 resources\n4. kube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics) - 27 resources\n5. foxtrot-eks - 24 resources", "filesRead": [ "reference/helm/index.md", "reference/helm-charts-kubernetes.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-deployment.md" ], "plannedFiles": [ "reference/helm/index.md", "reference/helm-charts-kubernetes.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-deployment.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "All 5 charts listed with exact resource counts matching ground truth. No hallucinations or extraneous information. Perfect alignment with expected answer format and content." }, "composite": 100, "notFound": false }, { "id": "q-004", "category": "structural", "difficulty": "easy", "question": "Which subsystem contains the most files, and how many?", "groundTruth": "runtime: 16309 files", "llmAnswer": "\n\nruntime, 16309", "filesRead": [ "reference/subsystems/runtime.md", "reference/index.md", "reference/system-architecture.md", "reference/subsystems/app-common.md", "reference/subsystems/control-core.md" ], "plannedFiles": [ "reference/subsystems/runtime.md", "reference/index.md", "reference/system-architecture.md", "reference/subsystems/app-common.md", "reference/subsystems/control-core.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly. Correctly identified 'runtime' as the subsystem with the most files and provided the precise count of 16309. No extraneous information or hallucinations present." }, "composite": 100, "notFound": false }, { "id": "q-005", "category": "dependency", "difficulty": "medium", "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]", "llmAnswer": "- **external-dns** (1.17.0)", "filesRead": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "score": { "accuracy": 3, "completeness": 2, "precision": 4, "navigation": 4, "notes": "AI correctly identified the dependency name and version (external-dns 1.17.0), demonstrating successful navigation to the chart. However, the answer omits the condition clause [condition: external-dns.enabled] which is part of the complete dependency specification in the ground truth. The formatting choice (bold markdown) doesn't introduce errors but is stylistic. The agent found the right information but extracted an incomplete representation of the dependency." }, "composite": 65, "notFound": false }, { "id": "q-006", "category": "dependency", "difficulty": "medium", "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", "llmAnswer": "Based on the documentation provided, the dependencies of the \"ingress-nginx\" chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` are:\n\n- ingress-nginx (4.12.0) *if ingress-nginx.enabled*", "filesRead": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 3, "notes": "AI correctly identified the dependency (ingress-nginx 4.12.0) and its condition. Answer matches ground truth in content. However, navigation score reduced because AI cited 'documentation provided' without explicitly demonstrating it located and read the specific Chart.yaml file at the given path. The answer appears correct but lacks clear evidence of file-level navigation." }, "composite": 90, "notFound": false }, { "id": "q-007", "category": "dependency", "difficulty": "medium", "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", "llmAnswer": "Based on the documentation provided, the dependencies of the \"kube-state-metrics\" chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` are:\n\n- kube-state-metrics (5.26.0) - conditional on `kube-state-metrics.enabled`", "filesRead": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer correctly identified the single dependency (kube-state-metrics 5.26.0) with its conditional requirement. Information matches ground truth exactly. No hallucinations or missing details. Formatting differs slightly (prose vs. bracket notation) but conveys identical technical content." }, "composite": 100, "notFound": false }, { "id": "q-008", "category": "dependency", "difficulty": "medium", "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", "llmAnswer": "Based on the documentation provided, the dependencies of the \"open-telemetry\" chart (at `app-common/apps-provider/addons/charts/common/otel-collector`) are:\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)", "filesRead": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/dependencies.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 4, "precision": 5, "navigation": 4, "notes": "AI correctly identified all 7 dependencies with accurate names and versions. Omitted the conditional information ([condition: ...]) present in ground truth, which represents incomplete detail capture rather than factual error. No hallucinations or incorrect data. Accuracy is perfect on core facts, but completeness is reduced due to missing conditional metadata." }, "composite": 90, "notFound": false }, { "id": "q-009", "category": "dependency", "difficulty": "medium", "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", "llmAnswer": "- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)", "filesRead": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", "reference/helm-charts-dependencies.md", "reference/dependencies.md", "reference/helm/index.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", "reference/helm-charts-dependencies.md", "reference/dependencies.md", "reference/helm/index.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 4, "precision": 5, "navigation": 5, "notes": "AI correctly identified both dependencies with accurate versions. Omitted conditional metadata ([condition: ...]) that appears in ground truth, which represents incomplete capture of dependency specifications. Core dependency facts are accurate and no hallucinations present." }, "composite": 95, "notFound": false }, { "id": "q-010", "category": "contract", "difficulty": "medium", "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", "llmAnswer": "**Shared Secrets & ConfigMaps Across Multiple Helm Charts:**\n\n1. **vault-secret** (Secret)\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app\n\n2. **vault-secret-rdm-tasks** (Secret)\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app", "filesRead": [ "reference/configuration-secrets.md", "reference/helm/index.md", "reference/helm-charts-dependencies.md", "reference/index.md", "reference/helm-charts-kubernetes.md" ], "plannedFiles": [ "reference/configuration-secrets.md", "reference/helm/index.md", "reference/helm-charts-dependencies.md", "reference/index.md", "reference/helm-charts-kubernetes.md" ], "score": { "accuracy": 3, "completeness": 2, "precision": 4, "navigation": 2, "notes": "AI correctly identified vault-secret and vault-secret-rdm-tasks with accurate chart mappings, but completely missed logdna-agent-key and logdna-agent-tags (50% of expected items). The information provided is accurate but the search was incomplete, suggesting insufficient exploration across all Helm charts or failure to identify all shared secrets/ConfigMaps in the codebase." }, "composite": 55, "notFound": false }, { "id": "q-011", "category": "configuration", "difficulty": "easy", "question": "How many top-level configuration keys does the twistlock-defender chart (at app-common/apps-provider/addons/charts/common/twistlock) expose in its values.yaml?", "groundTruth": "60", "llmAnswer": "60", "filesRead": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-twistlock.md", "reference/helm/index.md", "reference/index.md", "reference/helm-charts-deployment.md", "reference/helm-charts-kubernetes.md" ], "plannedFiles": [ "reference/helm/charts/app-common-apps-provider-addons-charts-common-twistlock.md", "reference/helm/index.md", "reference/index.md", "reference/helm-charts-deployment.md", "reference/helm-charts-kubernetes.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly. The specific numeric answer (60) is correct and directly addresses the question. No extraneous information or hallucination present. The correct answer indicates successful navigation to the twistlock-defender chart's values.yaml file." }, "composite": 100, "notFound": false }, { "id": "q-012", "category": "configuration", "difficulty": "easy", "question": "How many top-level configuration keys does the twistlock chart (at app-common/charts/twistlock) expose in its values.yaml?", "groundTruth": "60", "llmAnswer": "60", "filesRead": [ "reference/helm/charts/app-common-charts-twistlock.md", "reference/helm/index.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-twistlock.md", "reference/helm/index.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI agent provided the exact correct answer (60) matching ground truth. Answer is factually accurate, complete, and contains no extraneous information. The correct answer indicates successful navigation to and analysis of the target values.yaml file." }, "composite": 100, "notFound": false }, { "id": "q-013", "category": "configuration", "difficulty": "easy", "question": "How many top-level configuration keys does the kube-state-metrics chart (at app-common/charts/otel-collector/charts/kube-state-metrics) expose in its values.yaml?", "groundTruth": "54", "llmAnswer": "54", "filesRead": [ "reference/helm/charts/app-common-charts-otel-collector-charts-kube-state-metrics.md", "reference/helm/charts/app-common-charts-otel-collector.md", "reference/helm/index.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-otel-collector-charts-kube-state-metrics.md", "reference/helm/charts/app-common-charts-otel-collector.md", "reference/helm/index.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly. The agent correctly identified and analyzed the target values.yaml file, counted all 54 top-level configuration keys, and provided the precise answer without hallucination or extra information." }, "composite": 100, "notFound": false }, { "id": "q-014", "category": "configuration", "difficulty": "easy", "question": "How many top-level configuration keys does the milvus chart (at app-common/charts/milvus) expose in its values.yaml?", "groundTruth": "50", "llmAnswer": "50", "filesRead": [ "reference/helm/charts/app-common-charts-milvus.md", "reference/helm/index.md", "reference/subsystems/app-common.md", "reference/helm-charts-dependencies.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-milvus.md", "reference/helm/index.md", "reference/subsystems/app-common.md", "reference/helm-charts-dependencies.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly. The agent correctly identified 50 top-level configuration keys in the milvus chart's values.yaml file. Perfect score across all dimensions—accurate count, complete response to the specific question asked, no extraneous information, and successful navigation to the correct file location." }, "composite": 100, "notFound": false }, { "id": "q-015", "category": "configuration", "difficulty": "easy", "question": "How many top-level configuration keys does the cluster chart (at app-common/charts/cluster) expose in its values.yaml?", "groundTruth": "41", "llmAnswer": "41", "filesRead": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly. The agent correctly identified 41 top-level configuration keys in the cluster chart's values.yaml file with no extraneous information or hallucinations." }, "composite": 100, "notFound": false }, { "id": "q-016", "category": "interaction", "difficulty": "medium", "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", "llmAnswer": "Based on the documentation provided, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** → kubernetes\n2. **gw-metrics** → argo-rollouts-metrics\n3. **redis** → headless\n\nThese are documented in the \"Service-to-Service References\" section of the reference materials.", "filesRead": [ "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/service-interactions.md", "reference/index.md", "reference/helm-charts-kubernetes.md" ], "plannedFiles": [ "reference/helm-charts-dependencies.md", "reference/helm/index.md", "reference/service-interactions.md", "reference/index.md", "reference/helm-charts-kubernetes.md" ], "score": { "accuracy": 4, "completeness": 2, "precision": 3, "navigation": 1, "notes": "AI correctly identified the three service reference types but failed to distinguish between two separate hazelcast chart instances at different paths (app-common/apps-provider/addons/charts/state-apps/hz and app-common/charts/hazelcast). Missing path specificity and no evidence of navigating actual chart files. Reference to 'Service-to-Service References section' appears unsupported by ground truth structure." }, "composite": 50, "notFound": false }, { "id": "q-017", "category": "interaction", "difficulty": "hard", "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", "llmAnswer": "**Shared Network Ports Used by Multiple Helm Charts:**\n\n- **Port 443**: cluster, twistlock-defender, eck-operator, twistlock\n- **Port 8889**: gw-metrics, gw-traces, kube-state-metrics\n- **Port 9200**: eck-operator, elasticsearch\n- **Port 13133**: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", "filesRead": [ "reference/helm/index.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-kubernetes.md", "reference/helm-deployment.md" ], "plannedFiles": [ "reference/helm/index.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-kubernetes.md", "reference/helm-deployment.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 4, "notes": "AI answer matches ground truth exactly across all four ports and their associated charts. No hallucinations or omissions. Formatting adds clarity without introducing errors. Navigation score reflects perfect accuracy suggesting successful information retrieval, though the actual search process is not visible in the provided context." }, "composite": 95, "notFound": false }, { "id": "q-018", "category": "resource", "difficulty": "medium", "question": "What are the most common Kubernetes resource types generated across all Helm charts?", "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", "llmAnswer": "NOT_FOUND\n\nThe provided documentation files reference Kubernetes resource types and show resource counts per chart, but do not contain a ranked list of the most common Kubernetes resource types generated across all Helm charts. The files mention that this information should be tracked (e.g., \"lists Kubernetes resource types generated per chart\" in the reference index), but the actual resource type data is not included in the documentation excerpts provided.", "filesRead": [ "reference/helm/index.md", "reference/helm-charts-kubernetes.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-deployment.md" ], "plannedFiles": [ "reference/helm/index.md", "reference/helm-charts-kubernetes.md", "reference/index.md", "reference/helm-charts-dependencies.md", "reference/helm-charts-deployment.md" ], "score": { "accuracy": 0, "completeness": 0, "precision": 5, "navigation": 0, "notes": "AI agent correctly identified that the requested ranked list of Kubernetes resource types was not present in available documentation. While this demonstrates honest assessment (no hallucination), it fails to provide any of the ground truth data, resulting in zero accuracy and completeness scores. The agent did not locate the source containing this information." }, "composite": 25, "notFound": true }, { "id": "q-019", "category": "resource", "difficulty": "medium", "question": "What Kubernetes resource types does the cluster chart generate? List each type and count.", "groundTruth": "Secret: 6\nExternalSecret: 5\nServiceAccount: 5\nClusterRole: 5\nClusterRoleBinding: 5\nNamespace: 4\nStorageClass: 4\nDeployment: 3\nClusterSecretStore: 3\nConfigMap: 2\nRole: 2\nRoleBinding: 2\nDaemonSet: 2\nService: 2\nClusterIssuer: 1\nAzureIdentity: 1\nAzureIdentityBinding: 1\nAPIService: 1\nClusterExternalSecret: 1", "llmAnswer": "Based on the \"Resources Generated\" section in the cluster chart documentation:\n\n- Secret: 3\n- ClusterIssuer: 1\n- Namespace: 2\n- ConfigMap: 2\n- ExternalSecret: 4\n- ServiceAccount: 2\n- ClusterRole: 2\n- Role: 1\n- RoleBinding: 1\n- ClusterRoleBinding: 2\n- DaemonSet: 2\n- Service: 1\n- Deployment: 1\n- ClusterExternalSecret: 1\n- ClusterSecretStore: 3\n- StorageClass: 4", "filesRead": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md", "reference/helm-charts-kubernetes.md", "reference/helm-charts-dependencies.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md", "reference/helm-charts-kubernetes.md", "reference/helm-charts-dependencies.md" ], "score": { "accuracy": 1, "completeness": 2, "precision": 3, "navigation": 2, "notes": "AI answer contains significant count discrepancies across nearly all resource types (e.g., Secret 3 vs 6, ExternalSecret 4 vs 5, ServiceAccount 2 vs 5). Missing three resource types entirely (AzureIdentity, AzureIdentityBinding, APIService). Only 6 of 16 listed counts match ground truth. Resource types themselves are valid but quantification is substantially incorrect, suggesting either wrong source document or misinterpretation of data." }, "composite": 40, "notFound": false }, { "id": "q-020", "category": "resource", "difficulty": "medium", "question": "What Kubernetes resource types does the cluster chart generate? List each type and count.", "groundTruth": "ExternalSecret: 4\nStorageClass: 4\nSecret: 3\nClusterSecretStore: 3\nNamespace: 2\nConfigMap: 2\nServiceAccount: 2\nClusterRole: 2\nClusterRoleBinding: 2\nDaemonSet: 2\nClusterIssuer: 1\nRole: 1\nRoleBinding: 1\nService: 1\nDeployment: 1\nClusterExternalSecret: 1", "llmAnswer": "Based on the \"Resources Generated\" section in the cluster chart documentation:\n\n- Secret: 3\n- ClusterIssuer: 1\n- Namespace: 2\n- ConfigMap: 2\n- ExternalSecret: 4\n- ServiceAccount: 2\n- ClusterRole: 2\n- Role: 1\n- RoleBinding: 1\n- ClusterRoleBinding: 2\n- DaemonSet: 2\n- Service: 1\n- Deployment: 1\n- ClusterExternalSecret: 1\n- ClusterSecretStore: 3\n- StorageClass: 4", "filesRead": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md", "reference/helm-charts-kubernetes.md", "reference/helm-charts-dependencies.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md", "reference/helm-charts-kubernetes.md", "reference/helm-charts-dependencies.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "Perfect match. All 16 resource types listed with exact counts. Agent correctly identified and extracted all items from the cluster chart documentation without errors or hallucinations." }, "composite": 100, "notFound": false }, { "id": "q-021", "category": "resource", "difficulty": "medium", "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", "llmAnswer": "Based on the jenkins chart documentation, here are the Kubernetes resource types generated:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- PersistentVolumeClaim: 1\n- SecurityGroupPolicy: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- PodMonitoring: 1\n- Route: 1\n- ServiceMonitor: 1\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- SecretClaim: 1\n- Pod: 1\n\n**Total: 29 resources**", "filesRead": [ "reference/helm/charts/app-common-charts-jenkins.md", "reference/helm/index.md", "reference/helm-charts-kubernetes.md", "reference/index.md", "reference/helm-charts-dependencies.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-jenkins.md", "reference/helm/index.md", "reference/helm-charts-kubernetes.md", "reference/index.md", "reference/helm-charts-dependencies.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "The AI answer matches the ground truth exactly across all 19 resource types with correct counts. No hallucinations, omissions, or errors detected. Perfect alignment with expected output." }, "composite": 100, "notFound": false }, { "id": "q-022", "category": "cross-subsystem", "difficulty": "easy", "question": "Which subsystems are identified as cross-cutting concerns?", "groundTruth": "root", "llmAnswer": "Based on the documentation provided, the subsystem identified as a cross-cutting concern is:\n\n- **root**\n\nThis is explicitly stated in `reference/system-architecture.md` where root is listed with Kind: \"cross-cutting\", while all other 11 subsystems are classified as \"domain\" subsystems.", "filesRead": [ "reference/index.md", "reference/subsystem-architecture.md", "reference/system-architecture.md", "reference/service-interactions.md", "reference/subsystems/app-common.md" ], "plannedFiles": [ "reference/index.md", "reference/subsystem-architecture.md", "reference/system-architecture.md", "reference/service-interactions.md", "reference/subsystems/app-common.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI correctly identified 'root' as the sole cross-cutting concern subsystem, cited the specific source document (reference/system-architecture.md), and accurately distinguished it from the 11 domain subsystems. Answer matches ground truth exactly with no hallucinations or extraneous information." }, "composite": 100, "notFound": false }, { "id": "q-023", "category": "cross-subsystem", "difficulty": "hard", "question": "Which subsystems depend on root, and how heavily (by call+import count)?", "groundTruth": "app-tools: 98\nipam-tools: 78\ncompute-tools: 70\nruntime: 42\ncompute-common: 30", "llmAnswer": "\n\nBased on the Cross-Subsystem Dependencies table in `reference/system-architecture.md`:\n\n1. **app-tools** - 98 total (1 call + 97 imports)\n2. **ipam-tools** - 78 total (0 calls + 78 imports)\n3. **compute-tools** - 70 total (0 calls + 70 imports)\n4. **runtime** - 42 total (8 calls + 34 imports)\n5. **compute-common** - 30 total (1 call + 29 imports)\n6. **app-common** - 17 total (0 calls + 17 imports)\n7. **ipam-core** - 4 total (0 calls + 4 imports)", "filesRead": [ "reference/dependencies.md", "reference/subsystems/root.md", "reference/subsystem-architecture.md", "reference/index.md", "reference/system-architecture.md" ], "plannedFiles": [ "reference/dependencies.md", "reference/subsystems/root.md", "reference/subsystem-architecture.md", "reference/index.md", "reference/system-architecture.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI answer matches ground truth exactly on all 5 required subsystems with correct dependency counts. Includes call/import breakdown and cites source document. Additional subsystems (app-common, ipam-core) appear to be legitimate data from the source, not hallucinations. Demonstrates successful navigation to correct reference file." }, "composite": 100, "notFound": false }, { "id": "q-024", "category": "architectural", "difficulty": "hard", "question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core, root. Why might this be the case, and what do they actually contain?", "groundTruth": "These subsystems primarily contain Helm charts, Terraform, or Crossplane compositions rather than traditional code.", "llmAnswer": "These subsystems contain declarative artifacts rather than executable code:\n\n**account-common** contains configuration templates, data models, and infrastructure-as-code definitions for account operations. Its 8 files are Helm charts and configuration files (`.pre-commit-config.yaml`, `bitbucket-pipelines.yml`, chart definitions, and values files).\n\n**network-common** functions as a configuration and infrastructure-as-code layer with 24 files containing network topology definitions, infrastructure templates, and static configuration—specifically Helm charts for multi-cloud VPC/VNet provisioning (AWS, Azure, GCP) with no executable code.\n\n**network-core** operates as an infrastructure-as-code subsystem with 87 files representing declarative resource definitions, network policies, and deployment configurations. Its files are Terraform modules and configurations for VPCs, subnets, VPNs, and network policies across AWS, Azure, and GCP.\n\n**root** is a thin, stable contract layer with a single `Chart.yaml` file serving as foundational shared configuration rather than a feature subsystem.\n\nAll four subsystems follow an architectural pattern where they define foundational contracts, schemas, and infrastructure specifications that downstream systems consume through deployment pipelines or configuration management, rather than through direct code imports. This explains the absence of functions and modules—they are declarative specification layers, not executable code libraries.", "filesRead": [ "reference/subsystems/account-common.md", "reference/subsystems/network-common.md", "reference/subsystems/network-core.md", "reference/subsystems/root.md", "reference/subsystem-architecture.md" ], "plannedFiles": [ "reference/subsystems/account-common.md", "reference/subsystems/network-common.md", "reference/subsystems/network-core.md", "reference/subsystems/root.md", "reference/subsystem-architecture.md" ], "score": { "accuracy": 4, "completeness": 3, "precision": 3, "navigation": 4, "notes": "Answer correctly identifies Helm charts and Terraform as primary content types and explains the absence of functions/modules due to declarative nature. However, it omits Crossplane compositions, which the ground truth explicitly lists as one of the three primary types. The specific file counts and architectural explanations suggest file exploration but lack verification—some details may be inferred rather than observed. The answer goes beyond ground truth with architectural patterns and contract layer concepts that, while plausible, aren't validated against the source material." }, "composite": 70, "notFound": false }, { "id": "q-025", "category": "architectural", "difficulty": "easy", "question": "What is the current version and appVersion of the cluster Helm chart?", "groundTruth": "version: 1.0.0, appVersion: 1.0", "llmAnswer": "**Version:** 1.4.2 \n**AppVersion:** 1.2", "filesRead": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/helm/index.md", "reference/index.md" ], "score": { "accuracy": 0, "completeness": 2, "precision": 1, "navigation": 0, "notes": "AI provided completely incorrect version numbers (1.4.2 vs 1.0.0 and 1.2 vs 1.0). While the response structure includes both requested fields, all values are wrong, indicating either failure to locate the correct Helm chart file or hallucination of values." }, "composite": 15, "notFound": false }, { "id": "q-026", "category": "scenario", "difficulty": "hard", "question": "If you need to rotate the \"logdna-agent-key\" shared secret, which Helm charts would be affected and need redeployment?", "groundTruth": "cluster", "llmAnswer": "Based on the information in `reference/configuration-secrets.md`, the \"logdna-agent-key\" shared secret is consumed by:\n\n- cluster", "filesRead": [ "reference/configuration-secrets.md", "reference/helm/index.md", "reference/helm-charts-dependencies.md", "reference/index.md", "reference/dependencies.md" ], "plannedFiles": [ "reference/configuration-secrets.md", "reference/helm/index.md", "reference/helm-charts-dependencies.md", "reference/index.md", "reference/dependencies.md" ], "score": { "accuracy": 5, "completeness": 5, "precision": 5, "navigation": 5, "notes": "AI correctly identified 'cluster' as the affected Helm chart, matching ground truth exactly. Answer is concise, cites the appropriate reference document, and contains no extraneous or incorrect information. Demonstrates proper navigation to the configuration-secrets documentation." }, "composite": 100, "notFound": false }, { "id": "q-027", "category": "scenario", "difficulty": "hard", "question": "A new engineer needs to deploy the cluster application. What charts, configuration values, and external dependencies should they understand first?", "groundTruth": "Chart: cluster (app-common/apps-provider/addons/charts/common/cluster), Version: 1.0.0", "llmAnswer": "\n\nA new engineer deploying the cluster application should understand:\n\n**Chart Overview:**\nThe cluster chart (`app-common/charts/cluster`, version 1.4.2) is a Helm3 chart for all cluster-level deployments. It generates 32 Kubernetes resources including DaemonSets, Deployments, ClusterRoles, StorageClasses, and Secrets.\n\n**Key Configuration Values (41 total):**\nEssential values include:\n- `clusterName`, `cloud`, `arch` - cluster identity and platform\n- `deployLogdna`, `deployMetricsServer`, `deployElasticCRD`, `deployIstio` - feature flags for optional components\n- `logdnaRegistry`, `logdnaTagVersion` - LogDNA agent configuration\n- `istioVersion`, `istioNamespace`, `deployIstioTLS`, `deployIstioLogging` - Istio service mesh settings\n- `efsEnabled`, `efsid` - EFS storage configuration\n- `deploySentinelOne`, `s1agentExternalSecret`, `s1helperExternalSecret` - SentinelOne security agent\n- `deployWiz`, `deployPrisma` - additional security tools\n- `rootDomain`, `publicHostedZone`, `privateHostedZone` - DNS configuration\n- `clusterCustomerFacing`, `clusterInternal`, `clusterHipaa` - cluster classification flags\n\n**External Dependencies:**\n- Config references: `logdna-agent-key` and `logdna-agent-tags` (external secrets for LogDNA)\n- Network ports: `443` and `6443` (for SentinelOne agent communication)\n- Layer dependency: The cluster chart depends on the Runtime layer (order 0)\n\n**Resources Generated:**\nLogDNA monitoring stack, SentinelOne security agent, storage classes (app, efs-pv, prometheus-tsdb, aws-gp2-es), certificate issuers, and secret management infrastructure.", "filesRead": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/dependencies.md", "reference/configuration.md", "reference/helm/index.md", "reference/index.md" ], "plannedFiles": [ "reference/helm/charts/app-common-charts-cluster.md", "reference/dependencies.md", "reference/configuration.md", "reference/helm/index.md", "reference/index.md" ], "score": { "accuracy": 1, "completeness": 0, "precision": 1, "navigation": 0, "notes": "AI provided incorrect chart path (app-common/charts/cluster vs ground truth app-common/apps-provider/addons/charts/common/cluster) and wrong version (1.4.2 vs 1.0.0). The extensive configuration details appear hallucinated or sourced from a different chart version. Failed to locate the correct artifact specified in ground truth." }, "composite": 10, "notFound": false } ] }