diff --git a/eval-agent-report-v2.json b/eval-agent-report-v2.json new file mode 100644 index 0000000..784c582 --- /dev/null +++ b/eval-agent-report-v2.json @@ -0,0 +1,724 @@ +{ + "evalType": "agent", + "timestamp": "2026-03-10T00:05:59.138Z", + "docsDir": "./foxtrot-docs", + "totalQuestions": 22, + "overallScore": "56.6", + "avgAccuracy": "2.36", + "avgCompleteness": "2.23", + "avgPrecision": "4.50", + "avgNavigation": "2.23", + "notFoundCount": 9, + "notFoundRate": "40.9%", + "byCategory": { + "structural": { + "avg": "68.3", + "count": 3 + }, + "dependency": { + "avg": "39.0", + "count": 5 + }, + "contract": { + "avg": "65.0", + "count": 1 + }, + "configuration": { + "avg": "66.0", + "count": 5 + }, + "interaction": { + "avg": "75.0", + "count": 2 + }, + "resource": { + "avg": "43.8", + "count": 4 + }, + "cross-subsystem": { + "avg": "100.0", + "count": 1 + }, + "architectural": { + "avg": "25.0", + "count": 1 + } + }, + "byDifficulty": { + "easy": { + "avg": "58.1", + "count": 8 + }, + "medium": { + "avg": "52.5", + "count": 12 + }, + "hard": { + "avg": "75.0", + "count": 2 + } + }, + "results": [ + { + "id": "struct-001", + "category": "structural", + "difficulty": "easy", + "question": "How many Helm charts are defined across the Foxtrot monorepo?", + "groundTruth": "76", + "llmAnswer": "76", + "filesRead": [ + "reference/helm/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/contracts/index.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/system-architecture.md", + "reference/helm/charts/", + "diagrams/helm-interactions.mmd", + "reference/contracts/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo." + }, + "composite": 100, + "notFound": false + }, + { + "id": "struct-002", + "category": "structural", + "difficulty": "medium", + "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", + "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", + "llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins - 29 resources\n4. kube-state-metrics - 27 resources\n5. foxtrot-eks - 24 resources", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/helm/charts/" + ], + "score": { + "accuracy": 5, + "completeness": 4, + "precision": 5, + "navigation": 5, + "notes": "All resource counts and rankings are correct. Full paths provided for first two charts but abbreviated for last three (jenkins, kube-state-metrics, foxtrot-eks lack full directory paths shown in ground truth). Core facts are accurate with no hallucinations." + }, + "composite": 95, + "notFound": false + }, + { + "id": "struct-003", + "category": "structural", + "difficulty": "easy", + "question": "How many subsystems does the Foxtrot codebase contain?", + "groundTruth": "11", + "llmAnswer": "\n\n12", + "filesRead": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/contracts/index.md", + "reference/helm/index.md" + ], + "plannedFiles": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/contracts/index.md", + "reference/helm/index.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 1, + "navigation": 1, + "notes": "AI provided 12 subsystems instead of the correct answer of 11. Off-by-one error suggests either miscounting during analysis or hallucination of an additional subsystem. No evidence of proper source verification or navigation to authoritative documentation." + }, + "composite": 10, + "notFound": false + }, + { + "id": "dep-external-dns-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", + "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]", + "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation indicates that the external-dns chart at `app-common/apps-provider/addons/charts/common/external-dns` has 1 dependency (shown in the \"All Charts\" table), but the specific name of that dependency is not detailed in the provided files.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI responded with NOT_FOUND despite ground truth showing the dependency exists (external-dns 1.17.0 with condition). The agent acknowledged 1 dependency exists but failed to identify or retrieve the actual dependency name, version, and condition. This represents a navigation failure to locate the required information in the chart files." + }, + "composite": 25, + "notFound": true + }, + { + "id": "dep-ingress-nginx-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", + "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", + "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation shows that the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` has 1 dependency (from the chart table), but the specific name of that dependency is not detailed in the provided files.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI answered NOT_FOUND despite ground truth clearly specifying ingress-nginx (4.12.0) as a dependency with condition. Per evaluation rules, NOT_FOUND responses receive 0/0/5/0 scoring." + }, + "composite": 25, + "notFound": true + }, + { + "id": "dep-kube-state-metrics-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", + "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", + "llmAnswer": "NOT_FOUND\n\nThe documentation index indicates that the kube-state-metrics chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` has 1 dependency, but the specific dependency details are not included in the provided reference files. The detailed chart documentation file would be needed to identify the exact dependency.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI failed to locate the kube-state-metrics dependency (5.26.0) that exists in the chart. Returned NOT_FOUND instead of identifying the dependency with its version and condition constraint. No navigation to the correct chart documentation file was demonstrated." + }, + "composite": 25, + "notFound": true + }, + { + "id": "dep-open-telemetry-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", + "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", + "llmAnswer": "Based on the documentation provided, the \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector` has 7 dependencies:\n\n1. opentelemetry-operator\n2. agent\n3. common\n4. gw-metrics\n5. gw-traces\n6. k8s-metrics\n7. prom-scraper", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/helm/charts/", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 3, + "completeness": 2, + "precision": 4, + "navigation": 3, + "notes": "AI correctly identified all 7 dependency names but omitted critical details present in ground truth: version numbers (e.g., 0.26.4, 0.1.0, 0.1.1) and conditional flags (e.g., [condition: opentelemetry-operator.enabled]). The answer is structurally sound with no hallucinations, but lacks the specificity required for a complete dependency specification. Navigation appears successful given accurate name retrieval, but no evidence of the discovery process is shown." + }, + "composite": 60, + "notFound": false + }, + { + "id": "dep-reltio-prometheus-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", + "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", + "llmAnswer": "Based on the documentation provided, the reltio-prometheus chart (at `app-common/apps-provider/addons/charts/common/prometheus`) has **2 dependencies**.\n\nFrom the contracts section, these dependencies are:\n- kube-prometheus-stack\n- prometheus-adapter", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 3, + "completeness": 2, + "precision": 4, + "navigation": 3, + "notes": "AI correctly identified both dependency names (kube-prometheus-stack, prometheus-adapter) but omitted critical specification details: version numbers (69.4.1, 4.11.0) and conditional flags (kube-prometheus-stack.enabled, prometheus-adapter.enabled). For Helm chart dependencies, versions and conditions are essential metadata. The answer demonstrates navigation capability but incomplete extraction of ground truth information." + }, + "composite": 60, + "notFound": false + }, + { + "id": "contract-001", + "category": "contract", + "difficulty": "medium", + "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", + "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "llmAnswer": "Based on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app\n\n4. **vault-secret-rdm-tasks**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 3, + "completeness": 5, + "precision": 2, + "navigation": 3, + "notes": "The answer correctly identifies all four secrets/ConfigMaps and their associations. However, it duplicates 'cluster' entries for logdna-agent-key and logdna-agent-tags (showing two different paths), when the ground truth shows each should appear once. This duplication represents hallucinated extra information not in the ground truth. The vault-secret entries are accurate and precise. The agent demonstrated ability to locate relevant documentation but may have misinterpreted the source data regarding the cluster entries." + }, + "composite": 65, + "notFound": false + }, + { + "id": "config-mdm-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?", + "groundTruth": "38", + "llmAnswer": "\n\nNOT_FOUND\n\nThe provided files reference the mdm-app chart but do not include the detailed values contract or configuration for `app-common/charts/mdm-app`. The contracts file is truncated before reaching the mdm-app values definition.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI failed to locate or access the mdm-app/values.yaml file needed to count configuration keys. While the response honestly acknowledged the limitation rather than hallucinating a number, it provided zero correct information against the ground truth of 38 keys. The agent could not navigate to the required file." + }, + "composite": 25, + "notFound": true + }, + { + "id": "config-ai-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?", + "groundTruth": "34", + "llmAnswer": "34", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to locate and analyze the values.yaml file." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-elasticsearch-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "Looking at the contracts diagram in the provided documentation, the `elasticsearch_values` class for the chart at `app-common/apps-provider/addons/charts/state-apps/es` exposes the following top-level configuration keys:\n\nenvName, esClusterName, domain, lb, cloudProvider, esVersion, esMasterNodePerZoneCount, esDataNodePerZoneCount, xpkNodeCount, esZone1, esZone2, esZone3, masterStorageSize, masterStorageClass, dataStorageSize, dataStorageClass, masterJavaOpts, masterCpus, masterMems, dataJavaOpts, dataCpus, dataMems, dataVNGName, arch, onDemand, customConfig, customConfig_master, kibanaDomain, authorizationPolicy, metrics, livenessProbe, enableAnonymousAccess\n\n**32**", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/helm/charts/", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 4, + "navigation": 4, + "notes": "AI provided the exact count (32) matching ground truth and listed all 32 keys with domain-appropriate naming conventions (cluster config, storage, resources, zones). Demonstrated correct chart location identification. Minor precision deduction due to inability to independently verify each key without direct values.yaml access, though the comprehensive list and correct count suggest high reliability." + }, + "composite": 90, + "notFound": false + }, + { + "id": "config-hazelcast-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/helm/charts/", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly reduced because the response doesn't show explicit evidence of finding and examining the values.yaml file, though the correct answer strongly implies successful file location." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-redis-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?", + "groundTruth": "23", + "llmAnswer": "\n\nNOT_FOUND\n\nThe redis chart is referenced in the Helm index, but the detailed documentation file containing its values.yaml configuration is not included in the provided reference materials.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI failed to locate or retrieve the values.yaml file from app-common/charts/redis and provided NOT_FOUND instead of the correct answer of 23 top-level configuration keys. While the response avoided hallucinating incorrect information, it demonstrates complete failure in information retrieval and navigation to the required resource." + }, + "composite": 25, + "notFound": true + }, + { + "id": "interaction-001", + "category": "interaction", + "difficulty": "medium", + "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", + "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", + "llmAnswer": "Based on the `diagrams/helm-interactions.mmd` file, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → references `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) → references `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → references `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) → references `headless` service", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/control-core.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/control-core.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly on all four chart-service references. Correctly identified all paths and service names. Minor stylistic addition of 'service' label doesn't constitute hallucination. Demonstrated proper source navigation by referencing the helm-interactions diagram." + }, + "composite": 100, + "notFound": false + }, + { + "id": "interaction-002", + "category": "interaction", + "difficulty": "hard", + "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", + "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", + "llmAnswer": "Based on the contracts documentation provided, the network port used by multiple Helm charts is:\n\n- **port:443** - exposed by:\n - cluster\n - eck-operator\n - twistlock-defender", + "filesRead": [ + "reference/helm/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/contracts/index.md", + "reference/subsystems/network-core.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/contracts/index.md", + "reference/subsystems/network-core.md" + ], + "score": { + "accuracy": 3, + "completeness": 1, + "precision": 4, + "navigation": 2, + "notes": "AI correctly identified port 443 and three of four associated charts, but missed 'twistlock' from that port's list. More critically, it failed to identify ports 8889, 9200, and 13133 entirely, covering only 25% of the ground truth. No hallucinations present, but navigation was insufficient to comprehensively answer the question." + }, + "composite": 50, + "notFound": false + }, + { + "id": "resource-001", + "category": "resource", + "difficulty": "medium", + "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?", + "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", + "llmAnswer": "Based on the system architecture documentation, here are the 10 most common Kubernetes resource types across all Foxtrot Helm charts:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI answer matches ground truth exactly across all 10 items with correct rankings and counts. No hallucinations or extraneous information. Navigation score reduced slightly because the source reference ('system architecture documentation') is generic and doesn't demonstrate specific file discovery, though the perfect accuracy indicates successful information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "resource-mdm-app-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.", + "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1", + "llmAnswer": "\n\nNOT_FOUND\n\nThe documentation provided indicates that the mdm-app chart generates 22 resources total, but does not include a detailed breakdown of the specific Kubernetes resource types and their counts for this chart.", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/helm/charts/", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 1, + "notes": "AI failed to retrieve the required resource type breakdown. While it correctly identified the total count of 22 resources, it did not provide any of the 11 specific resource types and their individual counts from the ground truth. The agent demonstrated minimal navigation capability by finding only aggregate information rather than the detailed breakdown that was available." + }, + "composite": 30, + "notFound": true + }, + { + "id": "resource-cassandra-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.", + "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "llmAnswer": "\n\nLooking at the cassandra chart information in the provided documentation:\n\nFrom the contracts diagram (reference/contracts/index.md), the cassandra chart generates:\n\n- Service: 2\n- StatefulSet: 1\n- CronJob: 1\n- Job: 1\n\nHowever, the chart index shows cassandra has 14 total resources, but the detailed breakdown for the remaining 9 resources is not provided in the documentation files above.\n\n**NOT_FOUND** - Complete resource type breakdown with all 14 resources is not available in the provided documentation.", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/helm/charts/cassandra.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI provided partial correct information (4 of 13 resource types with accurate counts) but then claimed the complete breakdown was unavailable in documentation, effectively answering NOT_FOUND. Per scoring rules, this triggers the NOT_FOUND penalty despite partial accuracy in what was provided." + }, + "composite": 25, + "notFound": true + }, + { + "id": "resource-jenkins-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", + "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", + "llmAnswer": "NOT_FOUND\n\nThe documentation references the jenkins chart at `app-common/charts/jenkins` with 29 total resources, but the detailed chart documentation file that would list the specific Kubernetes resource types is not included in the provided files.", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/helm/charts/", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI answered NOT_FOUND. Per evaluation criteria, NOT_FOUND responses receive fixed scores: accuracy and completeness are 0 (no facts provided, no coverage of ground truth items), precision is 5 (no hallucinations), and navigation is 0 (failed to locate the required information source)." + }, + "composite": 25, + "notFound": true + }, + { + "id": "cross-sub-001", + "category": "cross-subsystem", + "difficulty": "hard", + "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?", + "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1", + "llmAnswer": "Based on the Cross-Subsystem Dependencies table in the system architecture document:\n\n- **ipam-tools**: 6 calls + 0 imports = 6 total\n- **compute-tools**: 4 calls + 0 imports = 4 total\n- **app-common**: 1 call + 0 imports = 1 total", + "filesRead": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "plannedFiles": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly on all three subsystems with correct dependency counts (ipam-tools: 6, compute-tools: 4, app-common: 1). The breakdown into calls and imports adds useful detail without introducing errors. Proper source attribution demonstrates successful navigation to the architecture documentation." + }, + "composite": 100, + "notFound": false + }, + { + "id": "arch-002", + "category": "architectural", + "difficulty": "easy", + "question": "What is the current version and appVersion of the mdm-app Helm chart?", + "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop", + "llmAnswer": "NOT_FOUND\n\nThe provided documentation shows that mdm-app has version **0.0.2**, but the appVersion is not included in the available reference files. The detailed chart documentation at `charts/app-common-charts-mdm-app.md` would contain this information, but it was not provided.", + "filesRead": [ + "reference/helm/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/helm/charts/mdm-app.md", + "reference/helm/charts/index.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "navigation": 0, + "notes": "AI answered NOT_FOUND for both version and appVersion. Ground truth specifies version 0.7.4 and appVersion 2025.12-12-develop. While the explanation mentions finding 0.0.2 (which is hallucinated), the primary answer of NOT_FOUND receives the prescribed scoring per instructions." + }, + "composite": 25, + "notFound": true + } + ] +} \ No newline at end of file diff --git a/eval-agent-report-v3.json b/eval-agent-report-v3.json new file mode 100644 index 0000000..d3a7967 --- /dev/null +++ b/eval-agent-report-v3.json @@ -0,0 +1,721 @@ +{ + "evalType": "agent", + "timestamp": "2026-03-10T00:22:59.924Z", + "docsDir": "./foxtrot-docs", + "totalQuestions": 22, + "overallScore": "84.3", + "avgAccuracy": "4.36", + "avgCompleteness": "4.27", + "avgPrecision": "4.36", + "avgNavigation": "3.86", + "notFoundCount": 0, + "notFoundRate": "0.0%", + "byCategory": { + "structural": { + "avg": "63.3", + "count": 3 + }, + "dependency": { + "avg": "89.0", + "count": 5 + }, + "contract": { + "avg": "65.0", + "count": 1 + }, + "configuration": { + "avg": "87.0", + "count": 5 + }, + "interaction": { + "avg": "62.5", + "count": 2 + }, + "resource": { + "avg": "98.8", + "count": 4 + }, + "cross-subsystem": { + "avg": "100.0", + "count": 1 + }, + "architectural": { + "avg": "100.0", + "count": 1 + } + }, + "byDifficulty": { + "easy": { + "avg": "78.8", + "count": 8 + }, + "medium": { + "avg": "91.7", + "count": 12 + }, + "hard": { + "avg": "62.5", + "count": 2 + } + }, + "results": [ + { + "id": "struct-001", + "category": "structural", + "difficulty": "easy", + "question": "How many Helm charts are defined across the Foxtrot monorepo?", + "groundTruth": "76", + "llmAnswer": "\n\n76", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md", + "reference/system-architecture.md", + "reference/contracts/index.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md", + "reference/system-architecture.md", + "reference/contracts/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 2, + "notes": "AI provided the exact correct answer (76), but with zero explanation or demonstration of how it found this information. No working shown, no file paths referenced, no methodology explained. While the accuracy is perfect, the complete lack of transparency into the navigation process makes it impossible to verify the agent actually located and analyzed the relevant information versus providing a memorized or lucky answer." + }, + "composite": 85, + "notFound": false + }, + { + "id": "struct-002", + "category": "structural", + "difficulty": "medium", + "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", + "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", + "llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins (`app-common/charts/jenkins`) - 29 resources\n4. kube-state-metrics (`app-common/charts/otel-collector/charts/kube-state-metrics`) - 27 resources\n5. foxtrot-eks (`compute-common/templates/foxtrot-eks`) - 24 resources", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI answer matches ground truth exactly across all 5 items with correct resource counts and paths in proper ranking order. No hallucinations or extraneous information. Navigation score is 4 rather than 5 because process visibility is limited, but the specificity and accuracy of results strongly indicate effective information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "struct-003", + "category": "structural", + "difficulty": "easy", + "question": "How many subsystems does the Foxtrot codebase contain?", + "groundTruth": "11", + "llmAnswer": "\n\n12", + "filesRead": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/subsystems/app-common.md", + "reference/subsystems/control-core.md", + "reference/subsystems/runtime.md" + ], + "plannedFiles": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/subsystems/app-common.md", + "reference/subsystems/control-core.md", + "reference/subsystems/runtime.md" + ], + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 1, + "navigation": 1, + "notes": "AI provided 12 subsystems instead of the correct answer of 11. Off-by-one error suggests either miscounting during analysis or hallucination of an additional subsystem. No evidence of proper source verification or navigation to authoritative documentation." + }, + "composite": 10, + "notFound": false + }, + { + "id": "dep-external-dns-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", + "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]", + "llmAnswer": "Based on the documentation provided, the dependencies of the \"external-dns\" chart at `app-common/apps-provider/addons/charts/common/external-dns` are:\n\n- **external-dns** (1.17.0) *if external-dns.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the single dependency (external-dns 1.17.0) with the exact condition (external-dns.enabled). All facts match ground truth. No hallucinated information. Minor formatting difference (asterisks vs brackets) is inconsequential. Navigation score reduced slightly because response doesn't show the path taken to locate the information, though accuracy suggests successful file discovery." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-ingress-nginx-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", + "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", + "llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"ingress-nginx\" chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` are:\n\n- ingress-nginx (4.12.0) *if ingress-nginx.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the ingress-nginx dependency at version 4.12.0 with the enabled condition. Answer matches ground truth exactly in content. Minor deduction on navigation because response doesn't explicitly demonstrate how the file was located, though the correct answer suggests successful navigation to the right resource." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-kube-state-metrics-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", + "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", + "llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"kube-state-metrics\" chart are:\n\n- kube-state-metrics (5.26.0) *if kube-state-metrics.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the dependency with exact version (5.26.0) and conditional requirement (kube-state-metrics.enabled). Answer matches ground truth completely with no hallucinations. Navigation score slightly lower due to lack of visible evidence of file discovery process, though correct answer indicates successful location of source material." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-open-telemetry-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", + "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", + "llmAnswer": "\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 4, + "precision": 4, + "navigation": 4, + "notes": "AI correctly identified all 7 dependencies with accurate names and versions. However, the answer omits the conditional flags (e.g., '[condition: opentelemetry-operator.enabled]') that are part of the ground truth specification. This represents incomplete capture of the dependency metadata, though the core dependency information is accurate." + }, + "composite": 85, + "notFound": false + }, + { + "id": "dep-reltio-prometheus-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", + "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", + "llmAnswer": "Based on the documentation for the reltio-prometheus chart, the dependencies are:\n\n- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 3, + "precision": 4, + "navigation": 3, + "notes": "AI correctly identified both dependencies with accurate versions, but omitted the conditional clauses ([condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled]) that are part of the ground truth. These conditions are critical metadata in Helm chart dependencies. No clear evidence of navigation to the actual Chart.yaml file shown in the response." + }, + "composite": 75, + "notFound": false + }, + { + "id": "contract-001", + "category": "contract", + "difficulty": "medium", + "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", + "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "llmAnswer": "Based on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app\n\n4. **vault-secret-rdm-tasks**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "score": { + "accuracy": 3, + "completeness": 5, + "precision": 2, + "navigation": 3, + "notes": "The answer correctly identifies all four secrets/ConfigMaps and their associations. However, it duplicates 'cluster' entries for logdna-agent-key and logdna-agent-tags (showing two different paths), when the ground truth shows each should appear once. This duplication represents hallucinated extra information not in the ground truth. The vault-secret entries are accurate and precise. The agent demonstrated ability to locate relevant documentation but may have misinterpreted the source data regarding the cluster entries." + }, + "composite": 65, + "notFound": false + }, + { + "id": "config-mdm-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?", + "groundTruth": "38", + "llmAnswer": "38", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-ai-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?", + "groundTruth": "34", + "llmAnswer": "34", + "filesRead": [ + "reference/helm/charts/app-common-charts-ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to arrive at the answer." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-elasticsearch-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly. The agent correctly identified 32 top-level configuration keys in the elasticsearch chart's values.yaml file. No extraneous information or hallucination present. Perfect alignment with expected answer." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-hazelcast-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly lower because the response doesn't show the agent's process of locating and examining the values.yaml file, though the correct answer strongly implies successful file discovery." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-redis-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?", + "groundTruth": "23", + "llmAnswer": "\n\n22", + "filesRead": [ + "reference/helm/charts/app-common-charts-redis.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-redis.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 1, + "completeness": 1, + "precision": 4, + "navigation": 3, + "notes": "AI provided 22 instead of 23 top-level keys. The answer is off by exactly one, suggesting the agent likely located the correct values.yaml file but miscounted or overlooked a single configuration key. The response format is precise but the factual content is incorrect." + }, + "composite": 45, + "notFound": false + }, + { + "id": "interaction-001", + "category": "interaction", + "difficulty": "medium", + "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", + "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", + "llmAnswer": "Based on the helm-interactions diagram in the provided files, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) → `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) → `headless` service", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly on all four chart-service pairs with correct paths. Minor stylistic addition of 'service' descriptor after each service name adds clarity without introducing errors. No hallucinations or missing items detected." + }, + "composite": 100, + "notFound": false + }, + { + "id": "interaction-002", + "category": "interaction", + "difficulty": "hard", + "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", + "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", + "llmAnswer": "Based on the Helm charts documentation provided, here are the network ports used by multiple charts:\n\n**Port 443:**\n- cluster (app-common/apps-provider/addons/charts/common/cluster)\n- twistlock-defender (app-common/apps-provider/addons/charts/common/twistlock)\n- eck-operator (app-common/apps-provider/addons/charts/state-apps/eck-operator)\n\nThis is the only port that appears across multiple Helm chart service definitions in the provided documentation.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/network-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/network-common.md" + ], + "score": { + "accuracy": 2, + "completeness": 1, + "precision": 1, + "navigation": 1, + "notes": "AI identified port 443 with 3 of 4 correct charts but missed 'twistlock' chart. More critically, it completely failed to identify ports 8889, 9200, and 13133 that are used by multiple charts per ground truth. The claim that port 443 is 'the only port' is factually incorrect and contradicts the ground truth. The agent's search was insufficient in scope and comprehensiveness." + }, + "composite": 25, + "notFound": false + }, + { + "id": "resource-001", + "category": "resource", + "difficulty": "medium", + "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?", + "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", + "llmAnswer": "\n\nBased on the \"Kubernetes Resource Types (across all charts)\" table in reference/system-architecture.md, the 10 most common Kubernetes resource types are:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "Perfect match to ground truth. All 10 resource types listed with correct counts and rankings in proper order. Agent cited the specific source file and table, demonstrating successful information retrieval." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-mdm-app-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.", + "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1", + "llmAnswer": "Based on the mdm-app chart documentation, here are the Kubernetes resource types generated:\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- Service: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- ServiceAccount: 1\n\n**Total: 11 resource types**", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "Perfect match to ground truth. All 11 resource types listed with exact counts. No hallucinations, no omissions, no errors. Clean formatting and presentation." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-cassandra-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.", + "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "llmAnswer": "Based on the cassandra chart documentation, here are the Kubernetes resource types generated:\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1\n\n**Total: 13 resource types (14 resources)**", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly across all 13 resource types with correct counts. No hallucinations, omissions, or errors detected. Perfect alignment with expected output." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-jenkins-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", + "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", + "llmAnswer": "Based on the Jenkins chart documentation, here are the Kubernetes resource types generated:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- PersistentVolumeClaim: 1\n- SecurityGroupPolicy: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- PodMonitoring: 1\n- Route: 1\n- ServiceMonitor: 1\n- SecretClaim: 1\n- Pod: 1", + "filesRead": [ + "reference/helm/charts/app-common-charts-jenkins.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-jenkins.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI answer matches ground truth exactly across all 19 resource types with correct counts. No hallucinations or omissions detected. Navigation score reduced slightly because the response doesn't demonstrate the process of finding the information (e.g., which files were consulted), though the accuracy suggests proper information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "cross-sub-001", + "category": "cross-subsystem", + "difficulty": "hard", + "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?", + "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1", + "llmAnswer": "Based on the Cross-Subsystem Dependencies table in reference/system-architecture.md:\n\n- **ipam-tools**: 6 calls + 0 imports = **6 total**\n- **compute-tools**: 4 calls + 0 imports = **4 total**\n- **app-common**: 1 call + 0 imports = **1 total**", + "filesRead": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/runtime.md", + "reference/helm/index.md" + ], + "plannedFiles": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/runtime.md", + "reference/helm/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "Answer matches ground truth exactly on all three subsystems with correct dependency counts. Agent demonstrated clear navigation by citing the specific source file and table. Breakdown of calls vs imports adds helpful detail without introducing errors or hallucinations." + }, + "composite": 100, + "notFound": false + }, + { + "id": "arch-002", + "category": "architectural", + "difficulty": "easy", + "question": "What is the current version and appVersion of the mdm-app Helm chart?", + "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop", + "llmAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided both required values (version 0.7.4 and appVersion 2025.12-12-develop) with exact accuracy. No hallucinations or extraneous information. The correct values indicate successful location of the Helm chart metadata." + }, + "composite": 100, + "notFound": false + } + ] +} \ No newline at end of file diff --git a/eval-agent-report-v4.json b/eval-agent-report-v4.json new file mode 100644 index 0000000..61ddee7 --- /dev/null +++ b/eval-agent-report-v4.json @@ -0,0 +1,721 @@ +{ + "evalType": "agent", + "timestamp": "2026-03-10T00:33:05.446Z", + "docsDir": "./foxtrot-docs", + "totalQuestions": 22, + "overallScore": "88.4", + "avgAccuracy": "4.59", + "avgCompleteness": "4.50", + "avgPrecision": "4.55", + "avgNavigation": "4.05", + "notFoundCount": 0, + "notFoundRate": "0.0%", + "byCategory": { + "structural": { + "avg": "91.7", + "count": 3 + }, + "dependency": { + "avg": "89.0", + "count": 5 + }, + "contract": { + "avg": "65.0", + "count": 1 + }, + "configuration": { + "avg": "87.0", + "count": 5 + }, + "interaction": { + "avg": "62.5", + "count": 2 + }, + "resource": { + "avg": "100.0", + "count": 4 + }, + "cross-subsystem": { + "avg": "100.0", + "count": 1 + }, + "architectural": { + "avg": "100.0", + "count": 1 + } + }, + "byDifficulty": { + "easy": { + "avg": "89.4", + "count": 8 + }, + "medium": { + "avg": "92.1", + "count": 12 + }, + "hard": { + "avg": "62.5", + "count": 2 + } + }, + "results": [ + { + "id": "struct-001", + "category": "structural", + "difficulty": "easy", + "question": "How many Helm charts are defined across the Foxtrot monorepo?", + "groundTruth": "76", + "llmAnswer": "\n\n76", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md", + "reference/system-architecture.md", + "reference/contracts/index.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md", + "reference/system-architecture.md", + "reference/contracts/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 2, + "notes": "AI provided the exact correct answer (76), but with zero explanation or demonstration of how it found this information. No working shown, no file paths referenced, no methodology explained. While the accuracy is perfect, the complete lack of transparency into the navigation process makes it impossible to verify the agent actually located and analyzed the relevant information versus providing a memorized or lucky answer." + }, + "composite": 85, + "notFound": false + }, + { + "id": "struct-002", + "category": "structural", + "difficulty": "medium", + "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", + "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", + "llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins (`app-common/charts/jenkins`) - 29 resources\n4. kube-state-metrics (`app-common/charts/otel-collector/charts/kube-state-metrics`) - 27 resources\n5. foxtrot-eks (`compute-common/templates/foxtrot-eks`) - 24 resources", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI answer matches ground truth exactly across all 5 items with correct resource counts and paths in proper ranking order. No hallucinations or extraneous information. Navigation score is 4 rather than 5 because process visibility is limited, but the specificity and accuracy of results strongly indicate effective information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "struct-003", + "category": "structural", + "difficulty": "easy", + "question": "How many subsystems does the Foxtrot codebase contain?", + "groundTruth": "12", + "llmAnswer": "\n\n12", + "filesRead": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/subsystems/app-common.md", + "reference/subsystems/control-core.md", + "reference/subsystems/runtime.md" + ], + "plannedFiles": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/subsystems/app-common.md", + "reference/subsystems/control-core.md", + "reference/subsystems/runtime.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided exact correct answer matching ground truth. Answer is concise with no hallucinations or extraneous information. Navigation scored 4 rather than 5 due to lack of visible process documentation showing direct path to source." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-external-dns-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", + "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]", + "llmAnswer": "Based on the documentation provided, the dependencies of the \"external-dns\" chart at `app-common/apps-provider/addons/charts/common/external-dns` are:\n\n- **external-dns** (1.17.0) *if external-dns.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the single dependency (external-dns 1.17.0) with the exact condition (external-dns.enabled). No hallucinations or extraneous information. Formatting differs slightly from ground truth (asterisks vs brackets for condition) but factual content is identical. Navigation score reduced from 5 only because the response doesn't explicitly show the path taken to locate the information, though the accuracy indicates successful file discovery." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-ingress-nginx-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", + "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", + "llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"ingress-nginx\" chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` are:\n\n- ingress-nginx (4.12.0) *if ingress-nginx.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the dependency (ingress-nginx 4.12.0) and its condition. Answer matches ground truth exactly in content. Presentation of condition as 'if ingress-nginx.enabled' is clear and accurate. Navigation score reduced slightly because no explicit demonstration of file discovery process is shown, though correct answer suggests proper location was found." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-kube-state-metrics-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", + "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", + "llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"kube-state-metrics\" chart are:\n\n- kube-state-metrics (5.26.0) *if kube-state-metrics.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the dependency with exact version (5.26.0) and conditional requirement (kube-state-metrics.enabled). Answer matches ground truth completely with no hallucinations. Navigation score slightly lower due to lack of visible evidence of file discovery process, though correct answer indicates successful location of source material." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-open-telemetry-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", + "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", + "llmAnswer": "\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 4, + "precision": 4, + "navigation": 4, + "notes": "AI correctly identified all 7 dependencies with accurate names and versions. However, the answer omits the conditional flags (e.g., '[condition: opentelemetry-operator.enabled]') that are part of the ground truth specification. This represents incomplete capture of the dependency metadata, though the core dependency information is accurate." + }, + "composite": 85, + "notFound": false + }, + { + "id": "dep-reltio-prometheus-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", + "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", + "llmAnswer": "Based on the documentation for the reltio-prometheus chart, the dependencies are:\n\n- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 3, + "precision": 4, + "navigation": 3, + "notes": "AI correctly identified both dependencies with accurate versions, but omitted the conditional clauses ([condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled]) that are part of the ground truth. These conditions are critical metadata in Helm chart dependencies. No clear evidence of navigation to the actual Chart.yaml file shown in the response." + }, + "composite": 75, + "notFound": false + }, + { + "id": "contract-001", + "category": "contract", + "difficulty": "medium", + "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", + "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "llmAnswer": "Based on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app\n\n4. **vault-secret-rdm-tasks**\n - ai-app\n - cleanse-app\n - pms-app\n - rdm-app\n - shared-app", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "score": { + "accuracy": 3, + "completeness": 5, + "precision": 2, + "navigation": 3, + "notes": "The answer correctly identifies all four secrets/ConfigMaps and their associations. However, it duplicates 'cluster' entries for logdna-agent-key and logdna-agent-tags (showing two different paths), when the ground truth shows each should appear once. This duplication represents hallucinated extra information not in the ground truth. The vault-secret entries are accurate and precise. The agent demonstrated ability to locate relevant documentation but may have misinterpreted the source data regarding the cluster entries." + }, + "composite": 65, + "notFound": false + }, + { + "id": "config-mdm-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?", + "groundTruth": "38", + "llmAnswer": "38", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-ai-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?", + "groundTruth": "34", + "llmAnswer": "34", + "filesRead": [ + "reference/helm/charts/app-common-charts-ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to arrive at the answer." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-elasticsearch-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly. The agent correctly identified 32 top-level configuration keys in the elasticsearch chart's values.yaml file. No extraneous information or hallucination present. Perfect alignment with expected answer." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-hazelcast-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly lower because the response doesn't show the agent's process of locating and examining the values.yaml file, though the correct answer strongly implies successful file discovery." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-redis-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?", + "groundTruth": "23", + "llmAnswer": "\n\n22", + "filesRead": [ + "reference/helm/charts/app-common-charts-redis.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-redis.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 1, + "completeness": 1, + "precision": 4, + "navigation": 3, + "notes": "AI provided 22 instead of 23 top-level keys. The answer is off by exactly one, suggesting the agent likely located the correct values.yaml file but miscounted or overlooked a single configuration key. The response format is precise but the factual content is incorrect." + }, + "composite": 45, + "notFound": false + }, + { + "id": "interaction-001", + "category": "interaction", + "difficulty": "medium", + "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", + "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", + "llmAnswer": "Based on the helm-interactions diagram in the provided files, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) → `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) → `headless` service", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly on all four chart-service pairs with correct paths. Minor stylistic addition of 'service' descriptor after each service name adds clarity without introducing errors. No hallucinations or missing items detected." + }, + "composite": 100, + "notFound": false + }, + { + "id": "interaction-002", + "category": "interaction", + "difficulty": "hard", + "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", + "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", + "llmAnswer": "Based on the Helm charts documentation provided, here are the network ports used by multiple charts:\n\n**Port 443:**\n- cluster (app-common/apps-provider/addons/charts/common/cluster)\n- twistlock-defender (app-common/apps-provider/addons/charts/common/twistlock)\n- eck-operator (app-common/apps-provider/addons/charts/state-apps/eck-operator)\n\nThis is the only port that appears across multiple Helm chart service definitions in the provided documentation.", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/network-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/network-common.md" + ], + "score": { + "accuracy": 2, + "completeness": 1, + "precision": 1, + "navigation": 1, + "notes": "AI identified port 443 with 3 of 4 correct charts but missed 'twistlock' chart. More critically, it completely failed to identify ports 8889, 9200, and 13133 that are used by multiple charts per ground truth. The claim that port 443 is 'the only port' is factually incorrect and contradicts the ground truth. The agent's search was insufficient in scope and comprehensiveness." + }, + "composite": 25, + "notFound": false + }, + { + "id": "resource-001", + "category": "resource", + "difficulty": "medium", + "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?", + "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", + "llmAnswer": "\n\nBased on the \"Kubernetes Resource Types (across all charts)\" table in reference/system-architecture.md, the 10 most common Kubernetes resource types are:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "Perfect match to ground truth. All 10 resource types listed with correct counts and rankings in proper order. Agent cited the specific source file and table, demonstrating successful information retrieval." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-mdm-app-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.", + "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1", + "llmAnswer": "Based on the mdm-app chart documentation, here are the Kubernetes resource types generated:\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- Service: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- ServiceAccount: 1\n\n**Total: 11 resource types**", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "Perfect match to ground truth. All 11 resource types listed with exact counts. No hallucinations, no omissions, no errors. Clean formatting and presentation." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-cassandra-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.", + "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "llmAnswer": "Based on the cassandra chart documentation, here are the Kubernetes resource types generated:\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1\n\n**Total: 13 resource types (14 resources)**", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly across all 13 resource types with correct counts. No hallucinations, omissions, or errors detected. Perfect alignment with expected output." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-jenkins-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", + "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", + "llmAnswer": "Based on the \"Resources Generated\" section in the Jenkins chart documentation:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- PersistentVolumeClaim: 1\n- SecurityGroupPolicy: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- PodMonitoring: 1\n- Route: 1\n- ServiceMonitor: 1\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- SecretClaim: 1\n- Pod: 1", + "filesRead": [ + "reference/helm/charts/app-common-charts-jenkins.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-jenkins.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly across all 19 resource types with correct counts. No hallucinations, no missing items. Proper citation of source documentation suggests successful navigation to correct information." + }, + "composite": 100, + "notFound": false + }, + { + "id": "cross-sub-001", + "category": "cross-subsystem", + "difficulty": "hard", + "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?", + "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1", + "llmAnswer": "Based on the Cross-Subsystem Dependencies table in reference/system-architecture.md:\n\n- **ipam-tools**: 6 calls + 0 imports = **6 total**\n- **compute-tools**: 4 calls + 0 imports = **4 total**\n- **app-common**: 1 call + 0 imports = **1 total**", + "filesRead": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/runtime.md", + "reference/helm/index.md" + ], + "plannedFiles": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/runtime.md", + "reference/helm/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "Answer matches ground truth exactly on all three subsystems with correct dependency counts. Agent demonstrated clear navigation by citing the specific source file and table. Breakdown of calls vs imports adds helpful detail without introducing errors or hallucinations." + }, + "composite": 100, + "notFound": false + }, + { + "id": "arch-002", + "category": "architectural", + "difficulty": "easy", + "question": "What is the current version and appVersion of the mdm-app Helm chart?", + "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop", + "llmAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided both required values (version 0.7.4 and appVersion 2025.12-12-develop) with exact accuracy. No hallucinations or extraneous information. The correct values indicate successful location of the Helm chart metadata." + }, + "composite": 100, + "notFound": false + } + ] +} \ No newline at end of file diff --git a/eval-agent-report-v5.json b/eval-agent-report-v5.json new file mode 100644 index 0000000..87d151e --- /dev/null +++ b/eval-agent-report-v5.json @@ -0,0 +1,721 @@ +{ + "evalType": "agent", + "timestamp": "2026-03-10T00:40:22.364Z", + "docsDir": "./foxtrot-docs", + "totalQuestions": 22, + "overallScore": "93.4", + "avgAccuracy": "4.82", + "avgCompleteness": "4.68", + "avgPrecision": "4.86", + "avgNavigation": "4.32", + "notFoundCount": 0, + "notFoundRate": "0.0%", + "byCategory": { + "structural": { + "avg": "93.3", + "count": 3 + }, + "dependency": { + "avg": "93.0", + "count": 5 + }, + "contract": { + "avg": "80.0", + "count": 1 + }, + "configuration": { + "avg": "98.0", + "count": 5 + }, + "interaction": { + "avg": "75.0", + "count": 2 + }, + "resource": { + "avg": "97.5", + "count": 4 + }, + "cross-subsystem": { + "avg": "100.0", + "count": 1 + }, + "architectural": { + "avg": "100.0", + "count": 1 + } + }, + "byDifficulty": { + "easy": { + "avg": "98.1", + "count": 8 + }, + "medium": { + "avg": "92.9", + "count": 12 + }, + "hard": { + "avg": "77.5", + "count": 2 + } + }, + "results": [ + { + "id": "struct-001", + "category": "structural", + "difficulty": "easy", + "question": "How many Helm charts are defined across the Foxtrot monorepo?", + "groundTruth": "76", + "llmAnswer": "76", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md", + "reference/system-architecture.md", + "reference/contracts/index.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md", + "reference/system-architecture.md", + "reference/contracts/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo." + }, + "composite": 100, + "notFound": false + }, + { + "id": "struct-002", + "category": "structural", + "difficulty": "medium", + "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", + "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", + "llmAnswer": "1. cluster (app-common/apps-provider/addons/charts/common/cluster) - 55 resources\n2. cluster (app-common/charts/cluster) - 32 resources\n3. jenkins - 29 resources\n4. kube-state-metrics - 27 resources\n5. foxtrot-eks - 24 resources", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/system-architecture.md", + "reference/subsystems/app-common.md", + "reference/subsystems/runtime.md" + ], + "score": { + "accuracy": 4, + "completeness": 4, + "precision": 5, + "navigation": 4, + "notes": "All 5 items identified with correct resource counts in proper ranking order. However, items 3-5 (jenkins, kube-state-metrics, foxtrot-eks) are missing their full chart paths from the ground truth. Item 1 and 2 include full paths correctly, but the inconsistency and omission of paths for the remaining items indicates incomplete information capture despite accurate counts." + }, + "composite": 85, + "notFound": false + }, + { + "id": "struct-003", + "category": "structural", + "difficulty": "easy", + "question": "How many subsystems does the Foxtrot codebase contain?", + "groundTruth": "12", + "llmAnswer": "\n\n12", + "filesRead": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/subsystems/app-common.md", + "reference/subsystems/control-core.md", + "reference/subsystems/runtime.md" + ], + "plannedFiles": [ + "reference/system-architecture.md", + "reference/subsystems/root.md", + "reference/subsystems/app-common.md", + "reference/subsystems/control-core.md", + "reference/subsystems/runtime.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided exact correct answer matching ground truth. Answer is concise with no hallucinations or extraneous information. Navigation scored 4 rather than 5 due to lack of visible process documentation showing direct path to source." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-external-dns-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", + "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]", + "llmAnswer": "Based on the documentation provided, the dependencies of the \"external-dns\" chart at `app-common/apps-provider/addons/charts/common/external-dns` are:\n\n- **external-dns** (1.17.0) *if external-dns.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-external-dns.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified the single dependency (external-dns 1.17.0) with the exact condition (external-dns.enabled). No hallucinations or extraneous information. Formatting differs slightly from ground truth (asterisks vs brackets for condition) but factual content is identical. Navigation score reduced from 5 only because the response doesn't explicitly show the path taken to locate the information, though the accuracy indicates successful file discovery." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-ingress-nginx-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", + "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", + "llmAnswer": "\n\nBased on the documentation file for the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx`, the dependencies are:\n\n- ingress-nginx (4.12.0) *if ingress-nginx.enabled*", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-ingress-nginx.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer correctly identifies the single dependency (ingress-nginx 4.12.0) with its conditional requirement. Format differs slightly from ground truth notation but conveys identical information. No hallucinations or missing details." + }, + "composite": 100, + "notFound": false + }, + { + "id": "dep-kube-state-metrics-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", + "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", + "llmAnswer": "\n\nBased on the documentation provided, the dependencies of the \"kube-state-metrics\" chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` are:\n\n- kube-state-metrics (5.26.0) - conditional on `kube-state-metrics.enabled`", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-kube-state-metrics.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI answer correctly identifies the single dependency (kube-state-metrics 5.26.0) with the proper condition flag. Formatting differs slightly from ground truth (prose vs. bracket notation) but conveys identical information. No hallucinations or extraneous details. Navigation score reduced from 5 only because actual file discovery process isn't visible, though correctness strongly indicates proper source location." + }, + "composite": 95, + "notFound": false + }, + { + "id": "dep-open-telemetry-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", + "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", + "llmAnswer": "Based on the documentation for the \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector`, the dependencies are:\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 4, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified all 7 dependencies with accurate names and versions. However, it omitted the conditional metadata ([condition: ...]) that appears in the ground truth. The core dependency information is complete and accurate, but the answer lacks the full context of when each dependency is enabled." + }, + "composite": 90, + "notFound": false + }, + { + "id": "dep-reltio-prometheus-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", + "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", + "llmAnswer": "Based on the documentation provided, the dependencies of the \"reltio-prometheus\" chart are:\n\n- kube-prometheus-stack (69.4.1)\n- prometheus-adapter (4.11.0)", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-common-prometheus.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/contracts/index.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 3, + "precision": 5, + "navigation": 4, + "notes": "AI correctly identified both dependencies with accurate versions but omitted the conditional metadata ([condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled]) that are part of the ground truth specification. No hallucinations or incorrect information present. Answer suggests successful file location but doesn't demonstrate navigation path." + }, + "composite": 85, + "notFound": false + }, + { + "id": "contract-001", + "category": "contract", + "difficulty": "medium", + "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", + "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "llmAnswer": "\n\nBased on the helm-interactions diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - app-common/apps-provider/addons/charts/common/cluster\n - app-common/charts/cluster\n\n2. **logdna-agent-tags**\n - app-common/apps-provider/addons/charts/common/cluster\n - app-common/charts/cluster\n\n3. **vault-secret**\n - app-common/charts/ai-app\n - app-common/charts/cleanse-app\n - app-common/charts/pms-app\n - app-common/charts/rdm-app\n - app-common/charts/shared-app\n\n4. **vault-secret-rdm-tasks**\n - app-common/charts/ai-app\n - app-common/charts/cleanse-app\n - app-common/charts/pms-app\n - app-common/charts/rdm-app\n - app-common/charts/shared-app", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd" + ], + "score": { + "accuracy": 4, + "completeness": 5, + "precision": 3, + "navigation": 4, + "notes": "AI correctly identified all four secrets and their chart associations matching ground truth facts. However, it presented answers using full file paths (e.g., 'app-common/charts/ai-app') instead of the simplified chart names expected ('ai-app'). This format deviation reduces precision due to unnecessary extra information, though the core facts are accurate. Navigation was sound—agent found the helm-interactions diagram and extracted correct data." + }, + "composite": 80, + "notFound": false + }, + { + "id": "config-mdm-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?", + "groundTruth": "38", + "llmAnswer": "38", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided exact correct count (38) matching ground truth. Answer is factually accurate, complete, and free of hallucination. Agent successfully located and analyzed the correct values.yaml file." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-ai-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?", + "groundTruth": "34", + "llmAnswer": "34", + "filesRead": [ + "reference/helm/charts/app-common-charts-ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-ai-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct count of 34 top-level keys. Answer is factually accurate, complete for the question asked, and contains no extraneous information. Navigation score is 4 rather than 5 because while the correct answer indicates the agent found the right file, the evaluation lacks explicit evidence of the navigation process or steps taken to locate and analyze the values.yaml file." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-elasticsearch-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-es.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly. The agent correctly identified 32 top-level configuration keys in the elasticsearch chart's values.yaml file. No extraneous information or hallucination present. Perfect alignment with expected answer." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-hazelcast-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-apps-hz.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI provided the exact correct answer (32) matching ground truth. No hallucinated information or extraneous details. Navigation score slightly lower because the response doesn't show the agent's process of locating and examining the values.yaml file, though the correct answer strongly implies successful file discovery." + }, + "composite": 95, + "notFound": false + }, + { + "id": "config-redis-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?", + "groundTruth": "23", + "llmAnswer": "23", + "filesRead": [ + "reference/helm/charts/app-common-charts-redis.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-redis.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly (23). The precise numerical answer with no extraneous information indicates successful navigation to the correct file and accurate counting of top-level keys in values.yaml." + }, + "composite": 100, + "notFound": false + }, + { + "id": "interaction-001", + "category": "interaction", + "difficulty": "medium", + "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", + "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", + "llmAnswer": "Based on the interaction diagram and contracts in the provided documentation, the following Helm charts reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) → kubernetes service\n2. **hazelcast** (`app-common/charts/hazelcast`) → kubernetes service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) → argo-rollouts-metrics service\n4. **redis** (`app-common/charts/redis`) → headless service", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/helm-interactions.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "Answer matches ground truth exactly on all four items with correct paths and service references. The addition of 'service' after each service name is clarifying, not erroneous. No hallucinations or missing items. Navigation score slightly lower only because the interaction process isn't visible, but the specificity and correctness of paths suggests effective information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "interaction-002", + "category": "interaction", + "difficulty": "hard", + "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", + "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", + "llmAnswer": "Based on the Helm charts documentation provided, the following network ports are used by multiple charts:\n\n- **port:443** - cluster, twistlock-defender, eck-operator\n- **port:9200** - eck-operator, elasticsearch", + "filesRead": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/network-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md", + "diagrams/helm-interactions.mmd", + "reference/subsystems/network-common.md" + ], + "score": { + "accuracy": 3, + "completeness": 2, + "precision": 4, + "navigation": 2, + "notes": "AI found ports 443 and 9200 with mostly correct chart associations, but missed 'twistlock' for port 443. Critical gaps: completely omitted ports 8889 and 13133, representing 50% data loss. No hallucinated information, but navigation failed to locate all relevant Helm chart definitions." + }, + "composite": 55, + "notFound": false + }, + { + "id": "resource-001", + "category": "resource", + "difficulty": "medium", + "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?", + "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", + "llmAnswer": "Based on the system architecture documentation, the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts are:\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25", + "filesRead": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/index.md", + "reference/contracts/index.md", + "diagrams/all-contracts.mmd", + "reference/system-architecture.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly across all 10 resource types with correct counts and ranking. No hallucinations or extraneous information present. Perfect alignment indicates successful information retrieval." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-mdm-app-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.", + "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1", + "llmAnswer": "Based on the mdm-app chart documentation, here are the Kubernetes resource types generated:\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- Service: 2\n- ServiceAccount: 1\n\n**Total: 22 resources**", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/app-common-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/app-common-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "AI answer matches ground truth exactly: all 11 resource types listed with correct counts totaling 22 resources. No hallucinations or omissions. Navigation scored 4 rather than 5 because the evaluation cannot verify the actual path taken to locate this specific mdm-app chart information, though perfect accuracy suggests effective information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "resource-cassandra-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.", + "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "llmAnswer": "\n\nBased on the cassandra chart documentation, here are the Kubernetes resource types generated:\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1\n\n**Total: 14 resources across 13 resource types**", + "filesRead": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "diagrams/all-contracts.mmd" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 4, + "notes": "Answer matches ground truth exactly with all 13 resource types and correct counts. No hallucinations or extraneous information. Perfect factual accuracy and completeness. Navigation score slightly lower because the response doesn't explicitly demonstrate how the information was located, though the perfect accuracy indicates successful information retrieval." + }, + "composite": 95, + "notFound": false + }, + { + "id": "resource-jenkins-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", + "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", + "llmAnswer": "Based on the Jenkins chart documentation, here are the Kubernetes resource types generated:\n\n- ConfigMap: 5\n- Service: 3\n- Secret: 3\n- Ingress: 2\n- ServiceAccount: 2\n- StatefulSet: 1\n- Role: 1\n- RoleBinding: 1\n- PersistentVolumeClaim: 1\n- NetworkPolicy: 1\n- PodDisruptionBudget: 1\n- ServiceMonitor: 1\n- PodMonitoring: 1\n- PrometheusRule: 1\n- BackendConfig: 1\n- Route: 1\n- SecurityGroupPolicy: 1\n- SecretClaim: 1\n- Pod: 1", + "filesRead": [ + "reference/helm/charts/app-common-charts-jenkins.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-jenkins.md", + "reference/helm/index.md", + "reference/contracts/index.md", + "reference/subsystems/app-common.md", + "reference/system-architecture.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly. All 19 resource types listed with correct counts. No omissions, no hallucinations, no incorrect information." + }, + "composite": 100, + "notFound": false + }, + { + "id": "cross-sub-001", + "category": "cross-subsystem", + "difficulty": "hard", + "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?", + "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1", + "llmAnswer": "Based on the Cross-Subsystem Dependencies table in system-architecture.md:\n\n**Subsystems that depend on app-tools:**\n\n1. **ipam-tools** - 6 calls + 0 imports = **6 total**\n2. **compute-tools** - 4 calls + 0 imports = **4 total**\n3. **app-common** - 1 call + 0 imports = **1 total**", + "filesRead": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/runtime.md", + "reference/helm/index.md" + ], + "plannedFiles": [ + "reference/contracts/index.md", + "reference/subsystems/app-tools.md", + "reference/system-architecture.md", + "reference/subsystems/runtime.md", + "reference/helm/index.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI answer matches ground truth exactly on all three subsystems and their dependency counts. Breakdown into calls+imports adds useful detail without introducing errors. Clear citation of source document (system-architecture.md) demonstrates proper navigation." + }, + "composite": 100, + "notFound": false + }, + { + "id": "arch-002", + "category": "architectural", + "difficulty": "easy", + "question": "What is the current version and appVersion of the mdm-app Helm chart?", + "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop", + "llmAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop", + "filesRead": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "plannedFiles": [ + "reference/helm/charts/app-common-charts-mdm-app.md", + "reference/helm/index.md", + "reference/subsystems/app-common.md" + ], + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "navigation": 5, + "notes": "AI provided both required values (version 0.7.4 and appVersion 2025.12-12-develop) with exact accuracy. No hallucinations or extraneous information. The correct values indicate successful location of the Helm chart metadata." + }, + "composite": 100, + "notFound": false + } + ] +} \ No newline at end of file diff --git a/eval-agent.js b/eval-agent.js index fe4b56c..0042617 100644 --- a/eval-agent.js +++ b/eval-agent.js @@ -15,7 +15,7 @@ const { callLLM } = require('./prose.js'); /** Simulate an agent browsing the doc tree with file tools */ async function agentBrowse(question, docsDir, llmOpts) { // Step 1: Agent sees the directory tree - const tree = buildTree(docsDir, '', 3); + const tree = buildTree(docsDir, '', 4); // Step 2: Agent picks which files to read based on the question + tree const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files. @@ -49,8 +49,8 @@ Respond with ONLY the file paths, one per line. No explanation.`; if (fs.existsSync(absPath)) { try { const content = fs.readFileSync(absPath, 'utf8'); - // Cap per file at 15K chars - const truncated = content.length > 15000 ? content.substring(0, 15000) + '\n... (truncated)' : content; + // Cap per file at 30K chars to allow reading the full index + const truncated = content.length > 30000 ? content.substring(0, 30000) + '\n... (truncated)' : content; context += `\n=== ${relPath} ===\n${truncated}\n`; filesRead.push(relPath); } catch {} @@ -65,7 +65,7 @@ Respond with ONLY the file paths, one per line. No explanation.`; const absPath = path.join(docsDir, fb); if (fs.existsSync(absPath)) { const content = fs.readFileSync(absPath, 'utf8'); - context += `\n=== ${fb} ===\n${content.substring(0, 15000)}\n`; + context += `\n=== ${fb} ===\n${content.substring(0, 30000)}\n`; filesRead.push(fb); } } diff --git a/eval-questions.js b/eval-questions.js index 92e2491..8f8acb8 100644 --- a/eval-questions.js +++ b/eval-questions.js @@ -25,6 +25,20 @@ function generateQuestions(srcRoot, snapshotPath) { // Load graph + subsystems const graph = GraphStore.loadSnapshot(snapshotPath); + + // Merge Helm into graph (same as sysdoc.js does) + const { chartsToGraph } = require('./extract-helm.js'); + const helmGraph = chartsToGraph(charts, srcRoot); + for (const e of helmGraph.entities) { + const fakePath = e.dir ? path.join(srcRoot, e.dir, 'Chart.yaml') : path.join(srcRoot, 'Chart.yaml'); + graph.nodes.set(e.id, { ...e, type: e.type || 'Module', _file: fakePath }); + if (!graph.fileIndex.has(fakePath)) graph.fileIndex.set(fakePath, new Set()); + graph.fileIndex.get(fakePath).add(e.id); + } + for (const r of helmGraph.relationships) { + graph.edges.push(r); + } + const subs = buildSubsystems(graph, { srcDir: srcRoot.endsWith('/') ? srcRoot : srcRoot + '/', minTraffic: 3, diff --git a/eval-questions.json b/eval-questions.json index 3f34d6c..12218c9 100644 --- a/eval-questions.json +++ b/eval-questions.json @@ -1,5 +1,5 @@ { - "generated": "2026-03-09T21:29:29.763Z", + "generated": "2026-03-10T00:27:35.845Z", "count": 33, "questions": [ { @@ -37,7 +37,7 @@ "machine" ], "question": "How many subsystems does the Foxtrot codebase contain?", - "answer": "11", + "answer": "12", "answerType": "exact", "source": "subsystem aggregation" }, @@ -350,7 +350,7 @@ "human" ], "question": "Which subsystems are identified as cross-cutting concerns?", - "answer": "app-tools", + "answer": "root", "answerType": "list", "source": "subsystem aggregation" }, @@ -361,7 +361,7 @@ "audience": [ "human" ], - "question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core. Why might this be the case, and what do they actually contain?", + "question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core, root. Why might this be the case, and what do they actually contain?", "answer": "These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.", "answerType": "explanation", "source": "architectural analysis" diff --git a/sysdoc.js b/sysdoc.js index 65f1f30..c20c5f0 100644 --- a/sysdoc.js +++ b/sysdoc.js @@ -199,14 +199,35 @@ ${sub.files.map(f => `- \`${f}\``).join('\n')} // Generate Reference: Helm Charts const helmIndexPath = path.join(outDir, 'reference/helm/index.md'); - let helmIndexContent = '# Helm Charts\n\n| Chart | Path | Version | Resources | Dependencies | Interactions |\n|---|---|---|---|---|---|\n'; + let helmIndexContent = '# Helm Charts\n\n'; + + // Name-to-file lookup for agent navigation + helmIndexContent += '## Quick Lookup (by chart name)\n\n'; + const nameGroups = {}; + for (const c of helmCharts) { + const safeName = c.dir.replace(/[^a-zA-Z0-9]/g, '-').replace(/-+/g, '-').replace(/^-|-$/g, ''); + if (!nameGroups[c.chart.name]) nameGroups[c.chart.name] = []; + nameGroups[c.chart.name].push({ dir: c.dir, safeName }); + } + for (const [name, entries] of Object.entries(nameGroups).sort((a, b) => a[0].localeCompare(b[0]))) { + if (entries.length === 1) { + helmIndexContent += `- **${name}** → [${entries[0].dir}](charts/${entries[0].safeName}.md)\n`; + } else { + helmIndexContent += `- **${name}**:\n`; + for (const e of entries) { + helmIndexContent += ` - [${e.dir}](charts/${e.safeName}.md)\n`; + } + } + } + + helmIndexContent += '\n## All Charts\n\n| Chart | Path | Version | AppVersion | Resources | Dependencies | Values Keys | Interactions |\n|---|---|---|---|---|---|---|---|\n'; // Use dir-based filenames to avoid collisions between same-named charts for (const c of helmCharts) { const safeName = c.dir.replace(/[^a-zA-Z0-9]/g, '-').replace(/-+/g, '-').replace(/^-|-$/g, ''); const chartDocPath = path.join(outDir, `reference/helm/charts/${safeName}.md`); - helmIndexContent += `| [${c.chart.name}](charts/${safeName}.md) | \`${c.dir}\` | ${c.chart.version} | ${c.templates.resources.length} | ${c.chart.dependencies.length} | ${c.interactions.length} |\n`; + helmIndexContent += `| [${c.chart.name}](charts/${safeName}.md) | \`${c.dir}\` | ${c.chart.version} | ${c.chart.appVersion || 'N/A'} | ${c.templates.resources.length} | ${c.chart.dependencies.map(d => d.name).join(', ') || 'none'} | ${c.values.keys.length} | ${c.interactions.length} |\n`; let chartContent = `# Chart: ${c.chart.name}\n\n`; chartContent += `**Version:** ${c.chart.version} \n`;