721 lines
35 KiB
JSON
721 lines
35 KiB
JSON
|
|
{
|
||
|
|
"evalType": "agent",
|
||
|
|
"timestamp": "2026-03-10T00:33:05.446Z",
|
||
|
|
"docsDir": "./foxtrot-docs",
|
||
|
|
"totalQuestions": 22,
|
||
|
|
"overallScore": "88.4",
|
||
|
|
"avgAccuracy": "4.59",
|
||
|
|
"avgCompleteness": "4.50",
|
||
|
|
"avgPrecision": "4.55",
|
||
|
|
"avgNavigation": "4.05",
|
||
|
|
"notFoundCount": 0,
|
||
|
|
"notFoundRate": "0.0%",
|
||
|
|
"byCategory": {
|
||
|
|
"structural": {
|
||
|
|
"avg": "91.7",
|
||
|
|
"count": 3
|
||
|
|
},
|
||
|
|
"dependency": {
|
||
|
|
"avg": "89.0",
|
||
|
|
"count": 5
|
||
|
|
},
|
||
|
|
"contract": {
|
||
|
|
"avg": "65.0",
|
||
|
|
"count": 1
|
||
|
|
},
|
||
|
|
"configuration": {
|
||
|
|
"avg": "87.0",
|
||
|
|
"count": 5
|
||
|
|
},
|
||
|
|
"interaction": {
|
||
|
|
"avg": "62.5",
|
||
|
|
"count": 2
|
||
|
|
},
|
||
|
|
"resource": {
|
||
|
|
"avg": "100.0",
|
||
|
|
"count": 4
|
||
|
|
},
|
||
|
|
"cross-subsystem": {
|
||
|
|
"avg": "100.0",
|
||
|
|
"count": 1
|
||
|
|
},
|
||
|
|
"architectural": {
|
||
|
|
"avg": "100.0",
|
||
|
|
"count": 1
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"byDifficulty": {
|
||
|
|
"easy": {
|
||
|
|
"avg": "89.4",
|
||
|
|
"count": 8
|
||
|
|
},
|
||
|
|
"medium": {
|
||
|
|
"avg": "92.1",
|
||
|
|
"count": 12
|
||
|
|
},
|
||
|
|
"hard": {
|
||
|
|
"avg": "62.5",
|
||
|
|
"count": 2
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"results": [
|
||
|
|
{
|
||
|
|
"id": "struct-001",
|
||
|
|
"category": "structural",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
|
||
|
|
"groundTruth": "76",
|
||
|
|
"llmAnswer": "\n\n76",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/runtime.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/contracts/index.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/runtime.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/contracts/index.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 2,
|
||
|
|
"notes": "AI provided the exact correct answer (76), but with zero explanation or demonstration of how it found this information. No working shown, no file paths referenced, no methodology explained. While the accuracy is perfect, the complete lack of transparency into the navigation process makes it impossible to verify the agent actually located and analyzed the relevant information versus providing a memorized or lucky answer."
|
||
|
|
},
|
||
|
|
"composite": 85,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "struct-002",
|
||
|
|
"category": "structural",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
|
||
|
|
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
|
||
|
|
"llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins (`app-common/charts/jenkins`) - 29 resources\n4. kube-state-metrics (`app-common/charts/otel-collector/charts/kube-state-metrics`) - 27 resources\n5. foxtrot-eks (`compute-common/templates/foxtrot-eks`) - 24 resources",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/runtime.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/runtime.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI answer matches ground truth exactly across all 5 items with correct resource counts and paths in proper ranking order. No hallucinations or extraneous information. Navigation score is 4 rather than 5 because process visibility is limited, but the specificity and accuracy of results strongly indicate effective information retrieval."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "struct-003",
|
||
|
|
"category": "structural",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many subsystems does the Foxtrot codebase contain?",
|
||
|
|
"groundTruth": "12",
|
||
|
|
"llmAnswer": "\n\n12",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/root.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/control-core.md",
|
||
|
|
"reference/subsystems/runtime.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/root.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/subsystems/control-core.md",
|
||
|
|
"reference/subsystems/runtime.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI provided exact correct answer matching ground truth. Answer is concise with no hallucinations or extraneous information. Navigation scored 4 rather than 5 due to lack of visible process documentation showing direct path to source."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-external-dns-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
|
||
|
|
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
|
||
|
|
"llmAnswer": "Based on the documentation provided, the dependencies of the \"external-dns\" chart at `app-common/apps-provider/addons/charts/common/external-dns` are:\n\n- **external-dns** (1.17.0) *if external-dns.enabled*",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI correctly identified the single dependency (external-dns 1.17.0) with the exact condition (external-dns.enabled). No hallucinations or extraneous information. Formatting differs slightly from ground truth (asterisks vs brackets for condition) but factual content is identical. Navigation score reduced from 5 only because the response doesn't explicitly show the path taken to locate the information, though the accuracy indicates successful file discovery."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-ingress-nginx-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
|
||
|
|
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
|
||
|
|
"llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"ingress-nginx\" chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` are:\n\n- ingress-nginx (4.12.0) *if ingress-nginx.enabled*",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI correctly identified the dependency (ingress-nginx 4.12.0) and its condition. Answer matches ground truth exactly in content. Presentation of condition as 'if ingress-nginx.enabled' is clear and accurate. Navigation score reduced slightly because no explicit demonstration of file discovery process is shown, though correct answer suggests proper location was found."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-kube-state-metrics-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
|
||
|
|
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
|
||
|
|
"llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"kube-state-metrics\" chart are:\n\n- kube-state-metrics (5.26.0) *if kube-state-metrics.enabled*",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI correctly identified the dependency with exact version (5.26.0) and conditional requirement (kube-state-metrics.enabled). Answer matches ground truth completely with no hallucinations. Navigation score slightly lower due to lack of visible evidence of file discovery process, though correct answer indicates successful location of source material."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-open-telemetry-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
|
||
|
|
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
|
||
|
|
"llmAnswer": "\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 4,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI correctly identified all 7 dependencies with accurate names and versions. However, the answer omits the conditional flags (e.g., '[condition: opentelemetry-operator.enabled]') that are part of the ground truth specification. This represents incomplete capture of the dependency metadata, though the core dependency information is accurate."
|
||
|
|
},
|
||
|
|
"composite": 85,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "dep-reltio-prometheus-001",
|
||
|
|
"category": "dependency",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
|
||
|
|
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
|
||
|
|
"llmAnswer": "Based on the documentation for the reltio-prometheus chart, the dependencies are:\n\n- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 3,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI correctly identified both dependencies with accurate versions, but omitted the conditional clauses ([condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled]) that are part of the ground truth. These conditions are critical metadata in Helm chart dependencies. No clear evidence of navigation to the actual Chart.yaml file shown in the response."
|
||
|
|
},
|
||
|
|
"composite": 75,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "contract-001",
|
||
|
|
"category": "contract",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
|
||
|
|
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
|
||
|
|
"llmAnswer": "Based on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app\n\n4. **vault-secret-rdm-tasks**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"diagrams/helm-interactions.mmd"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"diagrams/helm-interactions.mmd"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 3,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 2,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "The answer correctly identifies all four secrets/ConfigMaps and their associations. However, it duplicates 'cluster' entries for logdna-agent-key and logdna-agent-tags (showing two different paths), when the ground truth shows each should appear once. This duplication represents hallucinated extra information not in the ground truth. The vault-secret entries are accurate and precise. The agent demonstrated ability to locate relevant documentation but may have misinterpreted the source data regarding the cluster entries."
|
||
|
|
},
|
||
|
|
"composite": 65,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-mdm-app-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
|
||
|
|
"groundTruth": "38",
|
||
|
|
"llmAnswer": "38",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-ai-app-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
|
||
|
|
"groundTruth": "34",
|
||
|
|
"llmAnswer": "34",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-ai-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-ai-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to arrive at the answer."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-elasticsearch-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
|
||
|
|
"groundTruth": "32",
|
||
|
|
"llmAnswer": "32",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI answer matches ground truth exactly. The agent correctly identified 32 top-level configuration keys in the elasticsearch chart's values.yaml file. No extraneous information or hallucination present. Perfect alignment with expected answer."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-hazelcast-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
|
||
|
|
"groundTruth": "32",
|
||
|
|
"llmAnswer": "32",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 4,
|
||
|
|
"notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly lower because the response doesn't show the agent's process of locating and examining the values.yaml file, though the correct answer strongly implies successful file discovery."
|
||
|
|
},
|
||
|
|
"composite": 95,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "config-redis-count",
|
||
|
|
"category": "configuration",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
|
||
|
|
"groundTruth": "23",
|
||
|
|
"llmAnswer": "\n\n22",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-redis.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-redis.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 1,
|
||
|
|
"completeness": 1,
|
||
|
|
"precision": 4,
|
||
|
|
"navigation": 3,
|
||
|
|
"notes": "AI provided 22 instead of 23 top-level keys. The answer is off by exactly one, suggesting the agent likely located the correct values.yaml file but miscounted or overlooked a single configuration key. The response format is precise but the factual content is incorrect."
|
||
|
|
},
|
||
|
|
"composite": 45,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "interaction-001",
|
||
|
|
"category": "interaction",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
|
||
|
|
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
|
||
|
|
"llmAnswer": "Based on the helm-interactions diagram in the provided files, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) → `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) → `headless` service",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/helm-interactions.mmd",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/helm-interactions.mmd",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI answer matches ground truth exactly on all four chart-service pairs with correct paths. Minor stylistic addition of 'service' descriptor after each service name adds clarity without introducing errors. No hallucinations or missing items detected."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "interaction-002",
|
||
|
|
"category": "interaction",
|
||
|
|
"difficulty": "hard",
|
||
|
|
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
|
||
|
|
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
|
||
|
|
"llmAnswer": "Based on the Helm charts documentation provided, here are the network ports used by multiple charts:\n\n**Port 443:**\n- cluster (app-common/apps-provider/addons/charts/common/cluster)\n- twistlock-defender (app-common/apps-provider/addons/charts/common/twistlock)\n- eck-operator (app-common/apps-provider/addons/charts/state-apps/eck-operator)\n\nThis is the only port that appears across multiple Helm chart service definitions in the provided documentation.",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"diagrams/helm-interactions.mmd",
|
||
|
|
"reference/subsystems/network-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"diagrams/helm-interactions.mmd",
|
||
|
|
"reference/subsystems/network-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 2,
|
||
|
|
"completeness": 1,
|
||
|
|
"precision": 1,
|
||
|
|
"navigation": 1,
|
||
|
|
"notes": "AI identified port 443 with 3 of 4 correct charts but missed 'twistlock' chart. More critically, it completely failed to identify ports 8889, 9200, and 13133 that are used by multiple charts per ground truth. The claim that port 443 is 'the only port' is factually incorrect and contradicts the ground truth. The agent's search was insufficient in scope and comprehensiveness."
|
||
|
|
},
|
||
|
|
"composite": 25,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
|
||
|
|
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
|
||
|
|
"llmAnswer": "\n\nBased on the \"Kubernetes Resource Types (across all charts)\" table in reference/system-architecture.md, the 10 most common Kubernetes resource types are:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/all-contracts.mmd",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"diagrams/all-contracts.mmd",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "Perfect match to ground truth. All 10 resource types listed with correct counts and rankings in proper order. Agent cited the specific source file and table, demonstrating successful information retrieval."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-mdm-app-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
|
||
|
|
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
|
||
|
|
"llmAnswer": "Based on the mdm-app chart documentation, here are the Kubernetes resource types generated:\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- Service: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- ServiceAccount: 1\n\n**Total: 11 resource types**",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"diagrams/app-common-contracts.mmd"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "Perfect match to ground truth. All 11 resource types listed with exact counts. No hallucinations, no omissions, no errors. Clean formatting and presentation."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-cassandra-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
|
||
|
|
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
|
||
|
|
"llmAnswer": "Based on the cassandra chart documentation, here are the Kubernetes resource types generated:\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1\n\n**Total: 13 resource types (14 resources)**",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"diagrams/all-contracts.mmd"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"diagrams/all-contracts.mmd"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI answer matches ground truth exactly across all 13 resource types with correct counts. No hallucinations, omissions, or errors detected. Perfect alignment with expected output."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "resource-jenkins-001",
|
||
|
|
"category": "resource",
|
||
|
|
"difficulty": "medium",
|
||
|
|
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
|
||
|
|
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
|
||
|
|
"llmAnswer": "Based on the \"Resources Generated\" section in the Jenkins chart documentation:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- PersistentVolumeClaim: 1\n- SecurityGroupPolicy: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- PodMonitoring: 1\n- Route: 1\n- ServiceMonitor: 1\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- SecretClaim: 1\n- Pod: 1",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-jenkins.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-common.md",
|
||
|
|
"reference/system-architecture.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI answer matches ground truth exactly across all 19 resource types with correct counts. No hallucinations, no missing items. Proper citation of source documentation suggests successful navigation to correct information."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "cross-sub-001",
|
||
|
|
"category": "cross-subsystem",
|
||
|
|
"difficulty": "hard",
|
||
|
|
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
|
||
|
|
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
|
||
|
|
"llmAnswer": "Based on the Cross-Subsystem Dependencies table in reference/system-architecture.md:\n\n- **ipam-tools**: 6 calls + 0 imports = **6 total**\n- **compute-tools**: 4 calls + 0 imports = **4 total**\n- **app-common**: 1 call + 0 imports = **1 total**",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-tools.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/runtime.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/contracts/index.md",
|
||
|
|
"reference/subsystems/app-tools.md",
|
||
|
|
"reference/system-architecture.md",
|
||
|
|
"reference/subsystems/runtime.md",
|
||
|
|
"reference/helm/index.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "Answer matches ground truth exactly on all three subsystems with correct dependency counts. Agent demonstrated clear navigation by citing the specific source file and table. Breakdown of calls vs imports adds helpful detail without introducing errors or hallucinations."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"id": "arch-002",
|
||
|
|
"category": "architectural",
|
||
|
|
"difficulty": "easy",
|
||
|
|
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
|
||
|
|
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
|
||
|
|
"llmAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop",
|
||
|
|
"filesRead": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"plannedFiles": [
|
||
|
|
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||
|
|
"reference/helm/index.md",
|
||
|
|
"reference/subsystems/app-common.md"
|
||
|
|
],
|
||
|
|
"score": {
|
||
|
|
"accuracy": 5,
|
||
|
|
"completeness": 5,
|
||
|
|
"precision": 5,
|
||
|
|
"navigation": 5,
|
||
|
|
"notes": "AI provided both required values (version 0.7.4 and appVersion 2025.12-12-develop) with exact accuracy. No hallucinations or extraneous information. The correct values indicate successful location of the Helm chart metadata."
|
||
|
|
},
|
||
|
|
"composite": 100,
|
||
|
|
"notFound": false
|
||
|
|
}
|
||
|
|
]
|
||
|
|
}
|