feat: confluence benchmark, pattern extractor, agent KB, UX spec
- extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code - agent-kb.js: token-efficient JSON rendering of same doc tree - eval-confluence-ref-questions.json: 32 reference-only benchmark questions - wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%) - docs/human-ux-spec.md: BMad UX designer spec for human doc structure - Eval results: V2 at 28.7% vs confluence 77.8% baseline - Hub/spoke ownership now correctly extracted (95% on that question) - Naming conventions, regions, CIDRs surfaced in system-architecture.md
This commit is contained in:
719
eval-agent-report-v6.json
Normal file
719
eval-agent-report-v6.json
Normal file
@@ -0,0 +1,719 @@
|
||||
{
|
||||
"evalType": "agent",
|
||||
"timestamp": "2026-03-10T04:26:26.008Z",
|
||||
"docsDir": "./foxtrot-docs",
|
||||
"totalQuestions": 22,
|
||||
"overallScore": "91.4",
|
||||
"avgAccuracy": "4.73",
|
||||
"avgCompleteness": "4.45",
|
||||
"avgPrecision": "5.00",
|
||||
"avgNavigation": "4.09",
|
||||
"notFoundCount": 1,
|
||||
"notFoundRate": "4.5%",
|
||||
"byCategory": {
|
||||
"structural": {
|
||||
"avg": "96.7",
|
||||
"count": 3
|
||||
},
|
||||
"dependency": {
|
||||
"avg": "93.0",
|
||||
"count": 5
|
||||
},
|
||||
"contract": {
|
||||
"avg": "25.0",
|
||||
"count": 1
|
||||
},
|
||||
"configuration": {
|
||||
"avg": "97.0",
|
||||
"count": 5
|
||||
},
|
||||
"interaction": {
|
||||
"avg": "80.0",
|
||||
"count": 2
|
||||
},
|
||||
"resource": {
|
||||
"avg": "96.3",
|
||||
"count": 4
|
||||
},
|
||||
"cross-subsystem": {
|
||||
"avg": "100.0",
|
||||
"count": 1
|
||||
},
|
||||
"architectural": {
|
||||
"avg": "100.0",
|
||||
"count": 1
|
||||
}
|
||||
},
|
||||
"byDifficulty": {
|
||||
"easy": {
|
||||
"avg": "97.5",
|
||||
"count": 8
|
||||
},
|
||||
"medium": {
|
||||
"avg": "89.2",
|
||||
"count": 12
|
||||
},
|
||||
"hard": {
|
||||
"avg": "80.0",
|
||||
"count": 2
|
||||
}
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"id": "struct-001",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
|
||||
"groundTruth": "76",
|
||||
"llmAnswer": "76",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/subsystems/runtime.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/subsystems/runtime.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-002",
|
||||
"category": "structural",
|
||||
"difficulty": "medium",
|
||||
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
|
||||
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
|
||||
"llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins (`app-common/charts/jenkins`) - 29 resources\n4. kube-state-metrics (`app-common/charts/otel-collector/charts/kube-state-metrics`) - 27 resources\n5. foxtrot-eks (`compute-common/templates/foxtrot-eks`) - 24 resources",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/subsystems/runtime.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/subsystems/runtime.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI answer matches ground truth exactly across all 5 items with correct resource counts and paths in proper ranking order. No hallucinations or extraneous information. Navigation score is 4 rather than 5 because process visibility is limited, but the specificity and accuracy of results strongly indicate effective information retrieval."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-003",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "How many subsystems does the Foxtrot codebase contain?",
|
||||
"groundTruth": "12",
|
||||
"llmAnswer": "\n\n12",
|
||||
"filesRead": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/root.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/root.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI provided exact correct answer matching ground truth. Answer is concise with no hallucinations or extraneous information. Navigation scored 4 rather than 5 due to lack of visible process documentation showing direct path to source."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-external-dns-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
|
||||
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
|
||||
"llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"external-dns\" chart at `app-common/apps-provider/addons/charts/common/external-dns` are:\n\n- **external-dns** (1.17.0) *if external-dns.enabled*",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 3,
|
||||
"notes": "AI answer correctly identifies the single dependency (external-dns 1.17.0) and its condition (external-dns.enabled). Information is accurate and complete with no hallucinations. Formatting is appropriate for documentation. Navigation score is moderate because while the answer is correct, the response doesn't demonstrate the process of locating the chart file or its Chart.yaml/values.yaml to verify the dependency information."
|
||||
},
|
||||
"composite": 90,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-ingress-nginx-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
|
||||
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
|
||||
"llmAnswer": "\n\nBased on the documentation file for the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx`, the dependencies are:\n\n- **ingress-nginx** (4.12.0) *if ingress-nginx.enabled*",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer correctly identifies the single dependency (ingress-nginx 4.12.0) with the conditional flag. All facts match ground truth. No hallucinations or extraneous information. Agent explicitly cited the correct file path, demonstrating proper navigation."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-kube-state-metrics-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
|
||||
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
|
||||
"llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"kube-state-metrics\" chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` are:\n\n- kube-state-metrics (5.26.0) - conditional on `kube-state-metrics.enabled`",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/helm-interactions.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/helm-interactions.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI answer correctly identifies the single dependency (kube-state-metrics 5.26.0) with the proper condition flag. Formatting differs slightly from ground truth (prose vs. bracket notation) but conveys identical information. No hallucinations or extraneous details. Navigation score reduced from 5 only because actual file discovery process isn't visible, though correctness strongly indicates proper source location."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-open-telemetry-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
|
||||
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
|
||||
"llmAnswer": "Based on the documentation provided, the dependencies of the \"open-telemetry\" chart are:\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 3,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI correctly identified all 7 dependencies with accurate names and versions. However, it omitted the conditional metadata ([condition: ...]) that is part of the ground truth specification. This represents incomplete coverage of the full dependency information, though the core dependency list is accurate and complete."
|
||||
},
|
||||
"composite": 85,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-reltio-prometheus-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
|
||||
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
|
||||
"llmAnswer": "- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 4,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI correctly identified both dependencies with accurate versions. Omitted conditional metadata ([condition: ...]) that appears in ground truth, which represents incomplete capture of dependency specifications. Core dependency facts are accurate and no hallucinations present."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "contract-001",
|
||||
"category": "contract",
|
||||
"difficulty": "medium",
|
||||
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
|
||||
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe provided documentation does not contain explicit information about which specific secrets or ConfigMaps are shared across multiple Helm charts. While the contracts section references some external dependencies (such as `vault-secret`, `vault-secret-rdm-tasks`, `logdna-agent-key`, and `logdna-agent-tags`), the documentation does not provide a comprehensive mapping of shared secrets/ConfigMaps to the charts that consume them.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/system-architecture.md",
|
||||
"explanation/change-impact.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/system-architecture.md",
|
||||
"explanation/change-impact.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI answered NOT_FOUND when ground truth contains explicit mappings of 4 shared secrets/ConfigMaps across 5 Helm charts. The agent failed to locate or extract the required information from available documentation. No hallucinations present, but complete failure to answer the question."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "config-mdm-app-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
|
||||
"groundTruth": "38",
|
||||
"llmAnswer": "38",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-ai-app-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
|
||||
"groundTruth": "34",
|
||||
"llmAnswer": "34",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-charts-ai-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-charts-ai-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to locate and analyze the values.yaml file."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-elasticsearch-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
|
||||
"groundTruth": "32",
|
||||
"llmAnswer": "32",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer matches ground truth exactly. The agent correctly identified 32 top-level configuration keys in the elasticsearch chart's values.yaml file. No extraneous information or hallucination present. Perfect alignment with expected answer."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-hazelcast-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
|
||||
"groundTruth": "32",
|
||||
"llmAnswer": "32",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly lower because the response doesn't show the agent's process of locating and examining the values.yaml file, though the correct answer strongly implies successful file discovery."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-redis-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
|
||||
"groundTruth": "23",
|
||||
"llmAnswer": "23",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-charts-redis.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-charts-redis.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI answer matches ground truth exactly (23 keys). Answer is factually correct, complete, and contains no hallucinated information. Navigation score is 4 rather than 5 because the response doesn't show evidence of the agent's process in locating and analyzing the values.yaml file, though the correct answer indicates successful file discovery."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "interaction-001",
|
||||
"category": "interaction",
|
||||
"difficulty": "medium",
|
||||
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
|
||||
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
|
||||
"llmAnswer": "Based on the documentation provided, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) → `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) → `headless` service",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/system-architecture.md",
|
||||
"explanation/change-impact.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/system-architecture.md",
|
||||
"explanation/change-impact.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer matches ground truth exactly across all four chart-to-service mappings. Paths and service names are correct. Minor addition of 'service' label after each service name is clarifying, not inaccurate. No hallucinations or missing items."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "interaction-002",
|
||||
"category": "interaction",
|
||||
"difficulty": "hard",
|
||||
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
|
||||
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
|
||||
"llmAnswer": "Based on the contracts documentation provided, the following network ports are used by multiple Helm charts:\n\n- **Port 443**: cluster, eck-operator, twistlock-defender\n- **Port 9200**: eck-operator, elasticsearch",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 4,
|
||||
"completeness": 1,
|
||||
"precision": 5,
|
||||
"navigation": 2,
|
||||
"notes": "AI correctly identified Port 443 and 9200 with accurate chart mappings, but missed Port 8889 and 13133 entirely. Port 443 entry also omits 'twistlock' from the chart list. Answer covers only 2 of 4 ports and is missing 50% of the required information. No hallucinations present, but significant gaps in discovery."
|
||||
},
|
||||
"composite": 60,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
|
||||
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
|
||||
"llmAnswer": "\n\nBased on the \"Kubernetes Resource Types (across all charts)\" table in the system architecture documentation, the 10 most common Kubernetes resource types are:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"diagrams/all-contracts.mmd",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"diagrams/all-contracts.mmd",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer matches ground truth exactly across all 10 items with correct counts and ranking. Properly cited source documentation. No hallucinations or extraneous information."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-mdm-app-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
|
||||
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
|
||||
"llmAnswer": "Based on the mdm-app chart documentation, here are the Kubernetes resource types generated:\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- Service: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- ServiceAccount: 1\n\n**Total: 22 resources**",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "All 11 resource types listed with exact counts matching ground truth. No hallucinations or extraneous information. Navigation scored 4 rather than 5 because the response lacks explicit evidence of how the information was retrieved (e.g., which files were examined in the chart), though the perfect accuracy suggests reliable source access."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-cassandra-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
|
||||
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
|
||||
"llmAnswer": "Based on the cassandra chart documentation, here are the Kubernetes resource types generated:\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI answer matches ground truth exactly across all 13 resource types and counts. No hallucinations or extra information. Navigation score reduced from 5 because the response doesn't demonstrate the actual process of finding the information (no file references, execution logs, or search path shown), though the perfect accuracy indicates successful information retrieval."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-jenkins-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
|
||||
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
|
||||
"llmAnswer": "Based on the Jenkins chart documentation, here are the Kubernetes resource types generated:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- PersistentVolumeClaim: 1\n- SecurityGroupPolicy: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- PodMonitoring: 1\n- Route: 1\n- ServiceMonitor: 1\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- SecretClaim: 1\n- Pod: 1",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-charts-jenkins.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-charts-jenkins.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "Answer matches ground truth exactly across all 19 resource types with correct counts. No hallucinations or errors. Navigation score reduced from 5 because the response doesn't demonstrate the process of finding this information (no file references, tool usage, or working shown), though the perfect accuracy indicates successful information retrieval."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "cross-sub-001",
|
||||
"category": "cross-subsystem",
|
||||
"difficulty": "hard",
|
||||
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
|
||||
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
|
||||
"llmAnswer": "Based on the cross-subsystem dependencies table in the system-architecture.md file, the subsystems that depend on app-tools are:\n\n1. **ipam-tools**: 6 calls + 0 imports = **6 total**\n2. **compute-tools**: 4 calls + 0 imports = **4 total**\n3. **app-common**: 1 call + 0 imports = **1 total**\n\nAll dependencies on app-tools are through calls only (no imports), indicating they treat it as a service boundary for resource provisioning and credential creation operations.",
|
||||
"filesRead": [
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/runtime.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/subsystems/runtime.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer matches ground truth exactly on all three subsystems and their dependency counts. Agent correctly identified the source document (system-architecture.md), extracted precise numerical data, and provided accurate supplementary analysis about call-only dependencies without hallucination."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "arch-002",
|
||||
"category": "architectural",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
|
||||
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
|
||||
"llmAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop",
|
||||
"filesRead": [
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI provided both required values (version 0.7.4 and appVersion 2025.12-12-develop) with exact accuracy. No hallucinations or extraneous information. The correct values indicate successful location of the Helm chart metadata."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user