693 lines
38 KiB
JSON
693 lines
38 KiB
JSON
|
|
{
|
||
|
|
"evalType": "agent",
|
||
|
|
"timestamp": "2026-03-10T04:51:16.794Z",
|
||
|
|
"docsDir": "./foxtrot-docs-v3",
|
||
|
|
"totalQuestions": 22,
|
||
|
|
"overallScore": "59.3",
|
||
|
|
"avgAccuracy": "2.59",
|
||
|
|
"avgCompleteness": "2.41",
|
||
|
|
"avgPrecision": "4.64",
|
||
|
|
"avgNavigation": "2.23",
|
||
|
|
"notFoundCount": 8,
|
||
|
|
"notFoundRate": "36.4%",
|
||
|
|
"byCategory": {
|
||
|
|
"structural": {
|
||
|
|
"avg": "50.0",
|
||
|
|
"count": 3
|
||
|
|
},
|
||
|
|
"dependency": {
|
||
|
|
"avg": "61.0",
|
||
|
|
"count": 5
|
||
|
|
},
|
||
|
|
"contract": {
|
||
|
|
"avg": "25.0",
|
||
|
|
"count": 1
|
||
|
|
},
|
||
|
|
"configuration": {
|
||
|
|
"avg": "95.0",
|
||
|
|
"count": 5
|
||
|
|
},
|
||
|
|
"interaction": {
|
||
|
|
"avg": "25.0",
|
||
|
|
"count": 2
|
||
|
|
},
|
||
|
|
"resource": {
|
||
|
|
"avg": "25.0",
|
||
|
|
"count": 4
|
||
|
|
},
|
||
|
|
"cross-subsystem": {
|
||
|
|
"avg": "100.0",
|
||
|
|
"count": 1
|
||
|
|
},
|
||
|
|
"architectural": {
|
||
|
|
"avg": "100.0",
|
||
|
|
"count": 1
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"byDifficulty": {
|
||
|
|
"easy": {
|
||
|
|
"avg": "87.5",
|
||
|
|
"count": 8
|
||
|
|
},
|
||
|
|
"medium": {
|
||
|
|
"avg": "40.0",
|
||
|
|
"count": 12
|
||
|
|
},
|
||
|
|
"hard": {
|
||
|
|
"avg": "62.5",
|
||
|
|
"count": 2
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"results": [
|
||
|
|
{
|
||
|
|
"id": "struct-001",
|
||
|
|
"category": "structural",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
|
||
|
|
"groundTruth": "76",
|
||
|
|
"llmAnswer": "\n\n76",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/charts/app-common-charts-milvus.md",
|
||
|
|
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||
|
|
"reference/helm/charts/app-common-charts-hazelcast.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/charts/app-common-charts-milvus.md",
|
||
|
|
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||
|
|
"reference/helm/charts/app-common-charts-hazelcast.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 2,
|
||
|
|
"notes": "AI provided the exact correct answer (76), but with zero explanation or demonstration of how it found this information. No working shown, no file paths referenced, no methodology explained. While the accuracy is perfect, the complete lack of transparency into the navigation process makes it impossible to verify the agent actually located and analyzed the relevant information versus providing a memorized or lucky answer."
|
||
|
|
},
|
||
|
|
"composite": 85,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "struct-002",
|
||
|
|
"category": "structural",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
|
||
|
|
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
|
||
|
|
"llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation contains Helm chart configuration values and descriptions, but does not include explicit counts of Kubernetes resources produced by each chart. To accurately determine which charts produce the most resources would require analyzing the actual chart templates, which are not included in the reference materials provided.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-milvus.md",
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||
|
|
"reference/helm/charts/app-common-charts-loqate-app.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-milvus.md",
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||
|
|
"reference/helm/charts/app-common-charts-loqate-app.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "AI agent failed to locate or retrieve the resource count data for Helm charts. Provided no factual information from ground truth, but avoided hallucination by explicitly stating inability to find the data rather than inventing numbers."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "struct-003",
|
||
|
|
"category": "structural",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many subsystems does the Foxtrot codebase contain?",
|
||
|
|
"groundTruth": "12",
|
||
|
|
"llmAnswer": "\n\n11",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/compute-common.md",
|
||
|
|
"reference/subsystems/runtime.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/compute-common.md",
|
||
|
|
"reference/subsystems/runtime.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 1,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI provided 11 subsystems instead of the correct 12. The answer demonstrates the agent found relevant information (close count suggests correct area of codebase was examined) but made a counting error, missing one subsystem. No hallucinated details present, but the core fact is incorrect."
|
||
|
|
},
|
||
|
|
"composite": 40,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-external-dns-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
|
||
|
|
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
|
||
|
|
"llmAnswer": "Based on the documentation provided, the dependencies of the \"external-dns\" chart at app-common/apps-provider/addons/charts/common/external-dns are:\n\n- external-dns (version 1.17.0)",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 3,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI correctly identified external-dns version 1.17.0 but omitted the condition specification '[condition: external-dns.enabled]' from the ground truth. The answer is factually accurate for what it includes but incomplete. Navigation unclear—phrasing 'based on documentation provided' suggests passive receipt rather than active file discovery."
|
||
|
|
},
|
||
|
|
"composite": 75,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-ingress-nginx-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
|
||
|
|
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
|
||
|
|
"llmAnswer": "Based on the documentation provided, the \"ingress-nginx\" chart at app-common/apps-provider/addons/charts/common/ingress-nginx has the following dependency:\n\n- ingress-nginx (version 4.12.0)",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"explanation/change-impact.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"explanation/change-impact.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 3,
|
||
|
|
"completeness": 2,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI correctly identified the dependency name and version (4.12.0) but omitted the condition clause '[condition: ingress-nginx.enabled]' that is part of the ground truth specification. The answer captures the core dependency but is incomplete in representing the full dependency declaration as it would appear in a Chart.yaml or similar manifest."
|
||
|
|
},
|
||
|
|
"composite": 60,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-kube-state-metrics-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
|
||
|
|
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
|
||
|
|
"llmAnswer": "Based on the documentation provided, the dependencies of the \"kube-state-metrics\" chart are:\n\n- kube-state-metrics (version 5.26.0) from https://prometheus-community.github.io/helm-charts",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-cluster.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-cluster.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 3,
|
||
|
|
"completeness": 2,
|
||
|
|
"precision": 3,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI correctly identified the dependency name and version (5.26.0) but omitted the critical condition field [condition: kube-state-metrics.enabled] from the ground truth. Added unrequested source URL information. The phrase 'based on the documentation provided' suggests information was given rather than actively navigated to the file path."
|
||
|
|
},
|
||
|
|
"composite": 55,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-open-telemetry-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
|
||
|
|
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
|
||
|
|
"llmAnswer": "Based on the documentation provided, the dependencies of the \"open-telemetry\" chart are:\n\n- agent\n- common\n- gw-metrics\n- gw-traces\n- k8s-metrics\n- prom-scraper\n- opentelemetry-operator",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-common.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-agent.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-gw-metrics.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-common.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-agent.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-gw-metrics.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 2,
|
||
|
|
"completeness": 2,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 1,
|
||
|
|
"notes": "AI correctly identified all 7 dependency names but omitted critical metadata: version numbers (e.g., 0.26.4, 0.1.0) and conditional flags (e.g., [condition: opentelemetry-operator.enabled]). The answer lacks evidence of navigating to the specific chart file at the provided path; it references generic 'documentation' without demonstrating file-level access to the actual Chart.yaml or dependencies specification."
|
||
|
|
},
|
||
|
|
"composite": 45,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-reltio-prometheus-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
|
||
|
|
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
|
||
|
|
"llmAnswer": "\n\nBased on the documentation for the reltio-prometheus chart, the dependencies are:\n\n1. kube-prometheus-stack (version 69.4.1)\n2. prometheus-adapter (version 4.11.0)",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 4,
|
||
|
|
"completeness": 3,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI correctly identified both dependencies with accurate versions (69.4.1 and 4.11.0), but omitted the conditional metadata ([condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled]) that are part of the complete dependency specification in Helm charts. The answer captures the essential facts but lacks the full structural detail. No evidence of actual file navigation shown in the response."
|
||
|
|
},
|
||
|
|
"composite": 70,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "contract-001",
|
||
|
|
"category": "contract",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
|
||
|
|
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
|
||
|
|
"llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit definitions or references to ConfigMaps or Secrets that are shared across multiple Helm charts. While there are references to image pull secrets (e.g., `gcrregcred`) used across various charts, the documentation does not provide sufficient detail about shared ConfigMaps or Secrets to answer this question accurately.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||
|
|
"reference/helm/charts/app-common-charts-elasticsearch.md",
|
||
|
|
"reference/helm/charts/app-common-charts-hazelcast.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||
|
|
"reference/helm/charts/app-common-charts-elasticsearch.md",
|
||
|
|
"reference/helm/charts/app-common-charts-hazelcast.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "AI agent failed to locate shared secrets and ConfigMaps that exist in the documentation. Ground truth contains 4 shared resources (logdna-agent-key, logdna-agent-tags, vault-secret, vault-secret-rdm-tasks) across multiple charts, but agent returned NOT_FOUND. No hallucinations present, but complete failure to retrieve required information."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-mdm-app-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
|
||
|
|
"groundTruth": "38",
|
||
|
|
"llmAnswer": "38",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-ai-app-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
|
||
|
|
"groundTruth": "34",
|
||
|
|
"llmAnswer": "Looking at the Values table in the ai-app chart documentation, I count the top-level configuration keys (those without dot notation):\n\narch, baseRegistryURL, cloudProvider, clusterDomain, clusterName, commonAnnotations, commonJobLabels, commonLabels, commonPodLabels, commonservicesValues, domain, efsClaimHeapDumps, enableRdmAppServices, environmentType, etalonsubDomain, gcrregcred, global, helmTimeout, hooks, imagePullPolicy, imagePullSecrets, ingress, jobAnnotations, nodeSelector, persistent, region, reltioDomains, rootDomain, securityContext, serviceMesh, spotinstPercent, terminationGracePeriodSeconds, tolerations, vaultEnv\n\n**34**",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-ai-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-ai-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI provided exact count (34) matching ground truth, listed all 34 top-level keys without omissions or hallucinations, demonstrated correct navigation to values.yaml, and properly identified top-level keys (excluding dot-notation nested values). Answer is complete and accurate."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-elasticsearch-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
|
||
|
|
"groundTruth": "32",
|
||
|
|
"llmAnswer": "Looking at the elasticsearch chart documentation at `reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md`, I'll count the top-level configuration keys from the Values table.\n\nThe top-level keys are:\n1. arch\n2. authorizationPolicy\n3. cloudProvider\n4. customConfig\n5. customConfig_master\n6. dataCpus\n7. dataJavaOpts\n8. dataMems\n9. dataStorageClass\n10. dataStorageSize\n11. dataVNGName\n12. domain\n13. enableAnonymousAccess\n14. envName\n15. esClusterName\n16. esDataNodePerZoneCount\n17. esMasterNodePerZoneCount\n18. esVersion\n19. esZone1\n20. esZone2\n21. esZone3\n22. kibanaDomain\n23. lb\n24. livenessProbe\n25. masterCpus\n26. masterJavaOpts\n27. masterMems\n28. masterStorageClass\n29. masterStorageSize\n30. metrics\n31. onDemand\n32. xpkNodeCount\n\n**32**",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI provided exactly 32 keys matching ground truth. Keys are domain-appropriate for Elasticsearch Helm chart configuration (arch, authorizationPolicy, cloudProvider, etc.). Referenced specific documentation path correlating to the chart location. Cannot independently verify each key name without access to actual values.yaml, but the exact count match and plausible key names suggest accurate information retrieval rather than hallucination. Minor precision deduction due to inability to verify individual key accuracy."
|
||
|
|
},
|
||
|
|
"composite": 90,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-hazelcast-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
|
||
|
|
"groundTruth": "32",
|
||
|
|
"llmAnswer": "Looking at the hazelcast chart values table in the provided documentation, I'll count the unique top-level configuration keys (those before the first dot):\n\n1. MaxReplicas\n2. MinReplicas\n3. arch\n4. cloud\n5. cpuLimit\n6. cpuRequest\n7. diagnostics\n8. domain\n9. envName\n10. externalEndpoint\n11. gracefulShutdown\n12. hazelcast\n13. heapdumpEnabled\n14. hzClusterName\n15. imagePullPolicy\n16. imageRepository\n17. jmx\n18. livenessProbe\n19. maxSurge\n20. maxUnavailable\n21. memoryLimit\n22. memoryRequest\n23. metrics\n24. onDemand\n25. otel\n26. rbac\n27. readinessProbe\n28. regcred\n29. securityContext\n30. service\n31. serviceAccount\n32. targetCPUUtilizationPercentage\n\n**32**",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI provided the correct count of 32 top-level keys with a complete enumerated list. No hallucinations or extraneous information detected. However, navigation clarity is moderate—the AI referenced 'provided documentation' rather than demonstrating direct access to the specific file path (app-common/apps-provider/addons/charts/state-apps/hz/values.yaml), making it unclear whether the agent independently located the correct file or worked from context already provided."
|
||
|
|
},
|
||
|
|
"composite": 90,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-redis-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
|
||
|
|
"groundTruth": "23",
|
||
|
|
"llmAnswer": "23",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-redis.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-redis.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI answer matches ground truth exactly (23 keys). Answer is factually correct, complete, and contains no hallucinated information. Navigation score is 4 rather than 5 because the response doesn't show evidence of the agent's process in locating and analyzing the values.yaml file, though the correct answer indicates successful file discovery."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "interaction-001",
|
||
|
|
"category": "interaction",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
|
||
|
|
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
|
||
|
|
"llmAnswer": "NOT_FOUND\n\nThe provided documentation files contain Helm chart metadata, values, and dependencies between charts, but do not include specific information about which charts reference Kubernetes services from other components or the service endpoints they call.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/helm/charts/app-common-charts-cluster.md",
|
||
|
|
"reference/helm/charts/app-common-charts-kong.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/helm/charts/app-common-charts-cluster.md",
|
||
|
|
"reference/helm/charts/app-common-charts-kong.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "AI returned NOT_FOUND when ground truth data exists. The agent failed to locate or extract the four chart-to-service mappings present in the documentation. No hallucinations were introduced, but the core task was not accomplished."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "interaction-002",
|
||
|
|
"category": "interaction",
|
||
|
|
"difficulty": "hard",
|
||
|
|
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
|
||
|
|
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
|
||
|
|
"llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain sufficient port configuration details across the Helm charts to identify which network ports are used by multiple charts. While some charts like jenkins and elasticsearch have port values documented, the excerpts provided do not show enough cross-chart port information to make a comprehensive comparison.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-kong.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/charts/app-common-charts-elasticsearch.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/app-common-charts-kong.md",
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/charts/app-common-charts-elasticsearch.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "AI agent failed to locate port configuration data that exists in the documentation. Returned NOT_FOUND instead of identifying the 4 ports and their associated charts. No hallucinations present, but complete failure to retrieve required information."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
|
||
|
|
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
|
||
|
|
"llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation contains only the Values sections of the Foxtrot Helm charts (foxtrot-eks, foxtrot-gke, foxtrot-aks), which describe configuration parameters. The actual Kubernetes resource types generated by these charts are not documented in the files provided. To answer this question, I would need access to the Helm chart templates themselves (the .yaml files that define the resources).",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/compute-common-templates-foxtrot-eks.md",
|
||
|
|
"reference/helm/charts/compute-common-templates-foxtrot-gke.md",
|
||
|
|
"reference/helm/charts/compute-common-templates-foxtrot-aks.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/helm/charts/compute-common-templates-foxtrot-eks.md",
|
||
|
|
"reference/helm/charts/compute-common-templates-foxtrot-gke.md",
|
||
|
|
"reference/helm/charts/compute-common-templates-foxtrot-aks.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "AI correctly identified that the required information (Helm chart templates) was not available in the provided documentation (Values sections only). No hallucinations or incorrect facts were introduced. However, the question could not be answered due to missing source material."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-mdm-app-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
|
||
|
|
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
|
||
|
|
"llmAnswer": "NOT_FOUND\n\nThe mdm-app chart documentation provided shows configuration values and metadata, but does not explicitly list the Kubernetes resource types that the chart generates.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "Agent returned NOT_FOUND without providing any of the 11 resource types present in ground truth. While the response avoids hallucination, it fails to locate or extract the required information from available sources."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-cassandra-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
|
||
|
|
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
|
||
|
|
"llmAnswer": "NOT_FOUND\n\nThe provided documentation only contains the chart's configurable values and parameters. It does not specify which Kubernetes resource types (Deployment, StatefulSet, Service, etc.) the cassandra chart generates. This information would typically be found in the chart's template files, which are not included in the documentation provided.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "AI correctly identified that the provided documentation lacked template information but failed to locate or access the chart's template files where resource types are defined. Provided no resource types from the ground truth list. While the response contains no hallucinations (precision=5), it delivers zero factual content against the expected answer."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-jenkins-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
|
||
|
|
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
|
||
|
|
"llmAnswer": "NOT_FOUND\n\nThe Jenkins chart documentation provided does not contain an explicit list of all Kubernetes resource types generated by the chart with their counts. While the values file references various resource configurations (StatefulSet, Service, ConfigMap, Secret, Ingress, Route, PodDisruptionBudget, ServiceMonitor, PrometheusRule, BackendConfig, ServiceAccount, etc.), there is no comprehensive inventory that definitively enumerates each resource type and its count.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 0,
|
||
|
|
"completeness": 0,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 0,
|
||
|
|
"notes": "Agent returned NOT_FOUND instead of providing the required resource type inventory. While the response avoided hallucination (precision=5), it failed to locate or extract the ground truth data showing 19 resource types with their counts. The agent demonstrated no successful navigation to the relevant information source."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": true
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "cross-sub-001",
|
||
|
|
"category": "cross-subsystem",
|
||
|
|
"difficulty": "hard",
|
||
|
|
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
|
||
|
|
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
|
||
|
|
"llmAnswer": "- app-common: 1 call, 0 imports (1 total)\n- compute-tools: 4 calls, 0 imports (4 total)\n- ipam-tools: 6 calls, 0 imports (6 total)",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/subsystems/app-tools.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"explanation/change-impact.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/subsystems/app-tools.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"explanation/change-impact.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI answer matches ground truth exactly on all three subsystems with correct totals (ipam-tools: 6, compute-tools: 4, app-common: 1). The additional breakdown into calls vs imports adds useful detail without introducing hallucinations. Perfect alignment with expected answer."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "arch-002",
|
||
|
|
"category": "architectural",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
|
||
|
|
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
|
||
|
|
"llmAnswer": "**Version:** 0.7.4\n\n**AppVersion:** 2025.12-12-develop",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI answer matches ground truth exactly on both version (0.7.4) and appVersion (2025.12-12-develop). No extraneous information, all required fields present, correct facts only."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
}
|
||
|
|
]
|
||
|
|
}
|