Phase 9c: Split eval into Agent (file-browsing) and Human (readability) tracks
Agent eval: 54.3% (22 questions, 40.9% NOT_FOUND) Human eval: 63.9% (28 questions, 17.9% NOT_FOUND) Key findings: - Agent navigation is the bottleneck (2.09/5) — long path-based filenames hurt discoverability - Human findability is decent (3.46/5) but dependency questions fail (0%) because chart docs for wrapper charts don't surface their sub-chart deps - Both tracks show strong precision (4.4+/5) — very low hallucination - Resources (91%) and interactions (95%) score great for humans - Configuration and contracts are solid across both tracks
This commit is contained in:
719
eval-agent-report.json
Normal file
719
eval-agent-report.json
Normal file
@@ -0,0 +1,719 @@
|
||||
{
|
||||
"evalType": "agent",
|
||||
"timestamp": "2026-03-09T23:49:59.481Z",
|
||||
"docsDir": "./foxtrot-docs",
|
||||
"totalQuestions": 22,
|
||||
"overallScore": "54.3",
|
||||
"avgAccuracy": "2.27",
|
||||
"avgCompleteness": "2.09",
|
||||
"avgPrecision": "4.41",
|
||||
"avgNavigation": "2.09",
|
||||
"notFoundCount": 9,
|
||||
"notFoundRate": "40.9%",
|
||||
"byCategory": {
|
||||
"structural": {
|
||||
"avg": "68.3",
|
||||
"count": 3
|
||||
},
|
||||
"dependency": {
|
||||
"avg": "38.0",
|
||||
"count": 5
|
||||
},
|
||||
"contract": {
|
||||
"avg": "75.0",
|
||||
"count": 1
|
||||
},
|
||||
"configuration": {
|
||||
"avg": "55.0",
|
||||
"count": 5
|
||||
},
|
||||
"interaction": {
|
||||
"avg": "62.5",
|
||||
"count": 2
|
||||
},
|
||||
"resource": {
|
||||
"avg": "50.0",
|
||||
"count": 4
|
||||
},
|
||||
"cross-subsystem": {
|
||||
"avg": "100.0",
|
||||
"count": 1
|
||||
},
|
||||
"architectural": {
|
||||
"avg": "25.0",
|
||||
"count": 1
|
||||
}
|
||||
},
|
||||
"byDifficulty": {
|
||||
"easy": {
|
||||
"avg": "51.3",
|
||||
"count": 8
|
||||
},
|
||||
"medium": {
|
||||
"avg": "54.6",
|
||||
"count": 12
|
||||
},
|
||||
"hard": {
|
||||
"avg": "65.0",
|
||||
"count": 2
|
||||
}
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"id": "struct-001",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
|
||||
"groundTruth": "76",
|
||||
"llmAnswer": "76",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI provided the exact correct answer (76) matching ground truth. No extraneous information or hallucinations present. Correct answer indicates successful navigation to relevant Helm chart definitions in the monorepo."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-002",
|
||||
"category": "structural",
|
||||
"difficulty": "medium",
|
||||
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
|
||||
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
|
||||
"llmAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins (`app-common/charts/jenkins`) - 29 resources\n4. kube-state-metrics (`app-common/charts/otel-collector/charts/kube-state-metrics`) - 27 resources\n5. foxtrot-eks (`compute-common/templates/foxtrot-eks`) - 24 resources",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI answer matches ground truth exactly across all 5 items with correct resource counts and paths in proper ranking order. No hallucinations or extraneous information. Navigation score is 4 rather than 5 because process visibility is limited, but the specificity and accuracy of results strongly indicate effective information retrieval."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-003",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "How many subsystems does the Foxtrot codebase contain?",
|
||||
"groundTruth": "11",
|
||||
"llmAnswer": "\n\n12",
|
||||
"filesRead": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/root.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/helm/index.md",
|
||||
"diagrams/all-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/root.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/helm/index.md",
|
||||
"diagrams/all-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 1,
|
||||
"navigation": 1,
|
||||
"notes": "AI provided 12 subsystems instead of the correct answer of 11. Off-by-one error suggests either miscounting during analysis or hallucination of an additional subsystem. No evidence of proper source verification or navigation to authoritative documentation."
|
||||
},
|
||||
"composite": 10,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-external-dns-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
|
||||
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe documentation shows that the external-dns chart at `app-common/apps-provider/addons/charts/common/external-dns` has 1 dependency, but the specific name of that dependency is not listed in the provided files.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI returned NOT_FOUND when ground truth confirms 1 dependency exists: external-dns (1.17.0) with condition external-dns.enabled. Agent failed to locate the Chart.yaml or dependency information at the specified path."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "dep-ingress-nginx-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
|
||||
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe Helm Charts index shows that the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` has 1 dependency, but the specific name of that dependency is not documented in the provided files.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI failed to locate the ingress-nginx dependency. Ground truth confirms the chart has 1 dependency: ingress-nginx (4.12.0) with condition ingress-nginx.enabled. The NOT_FOUND response indicates the agent did not successfully navigate to or parse the chart's dependency metadata."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "dep-kube-state-metrics-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
|
||||
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe documentation shows that the kube-state-metrics chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` has 1 dependency (as indicated in the Helm Charts index table), but the specific name of that dependency is not listed in the provided files.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI answered NOT_FOUND when the dependency clearly exists in ground truth (kube-state-metrics 5.26.0). Failed to locate or extract the documented dependency information from the chart. No hallucinations present, but complete failure to find and report the correct answer."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "dep-open-telemetry-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
|
||||
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
|
||||
"llmAnswer": "Based on the Helm Charts table in `reference/helm/index.md`, the \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector` has 7 dependencies.\n\nFrom the contracts diagram, these dependencies are:\n\n1. agent\n2. common\n3. gw-metrics\n4. gw-traces\n5. k8s-metrics\n6. prom-scraper\n7. opentelemetry-operator",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 3,
|
||||
"completeness": 2,
|
||||
"precision": 4,
|
||||
"navigation": 2,
|
||||
"notes": "AI correctly identifies all 7 dependency names but omits critical specification details: version numbers (e.g., opentelemetry-operator 0.26.4) and conditional flags (e.g., [condition: opentelemetry-operator.enabled]). Navigation references generic documentation rather than demonstrating access to the specific Chart.yaml file at the given path. Answer is partially correct but substantially incomplete."
|
||||
},
|
||||
"composite": 55,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-reltio-prometheus-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
|
||||
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
|
||||
"llmAnswer": "Based on the Helm Charts index in the provided documentation, the \"reltio-prometheus\" chart has **2 dependencies**.\n\nFrom the contract definition shown in the system documentation, these dependencies are:\n\n1. kube-prometheus-stack\n2. prometheus-adapter",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 3,
|
||||
"completeness": 2,
|
||||
"precision": 4,
|
||||
"navigation": 3,
|
||||
"notes": "AI correctly identified both dependency names (kube-prometheus-stack, prometheus-adapter) but omitted critical details: version numbers (69.4.1, 4.11.0) and conditions (kube-prometheus-stack.enabled, prometheus-adapter.enabled). The answer is partially correct but substantially incomplete relative to ground truth specification."
|
||||
},
|
||||
"composite": 60,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "contract-001",
|
||||
"category": "contract",
|
||||
"difficulty": "medium",
|
||||
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
|
||||
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
|
||||
"llmAnswer": "Based on the interaction diagram in the provided documentation, the following secrets/ConfigMaps are shared across multiple Helm charts:\n\n1. **logdna-agent-key**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n2. **logdna-agent-tags**\n - cluster (app-common/apps-provider/addons/charts/common/cluster)\n - cluster (app-common/charts/cluster)\n\n3. **vault-secret**\n - ai-app (app-common/charts/ai-app)\n - cleanse-app (app-common/charts/cleanse-app)\n - pms-app (app-common/charts/pms-app)\n - rdm-app (app-common/charts/rdm-app)\n - shared-app (app-common/charts/shared-app)\n\n4. **vault-secret-rdm-tasks**\n - ai-app (app-common/charts/ai-app)\n - cleanse-app (app-common/charts/cleanse-app)\n - pms-app (app-common/charts/pms-app)\n - rdm-app (app-common/charts/rdm-app)\n - shared-app (app-common/charts/shared-app)",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 4,
|
||||
"completeness": 5,
|
||||
"precision": 2,
|
||||
"navigation": 4,
|
||||
"notes": "AI correctly identified all four secrets and their associated charts. However, precision suffers from: (1) duplicating 'cluster' entries for logdna-agent-key and logdna-agent-tags with different paths when ground truth shows each only once, (2) including unnecessary full file paths that weren't requested, (3) formatting as numbered list with bold headers instead of the simple key-value format expected. The answer demonstrates successful information retrieval but poor adherence to the expected output format and includes potentially hallucinated path duplications."
|
||||
},
|
||||
"composite": 75,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-mdm-app-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
|
||||
"groundTruth": "38",
|
||||
"llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation files do not contain the complete contract definition for the mdm-app chart's values.yaml. While the chart is listed in the Helm Charts index (version 0.7.4 at `app-common/charts/mdm-app`), the detailed values schema showing the top-level configuration keys is not included in the excerpts provided.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI failed to locate the mdm-app chart's values.yaml file and retrieve the top-level configuration key count. Response was 'NOT_FOUND' instead of the correct answer of 38 keys. While the agent was honest about not finding the information (no hallucinations), it did not successfully navigate to or examine the target file."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "config-ai-app-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
|
||||
"groundTruth": "34",
|
||||
"llmAnswer": "Looking at the `reference/contracts/index.md` file, I can find the `ai_app_values` class which defines the top-level configuration keys for the ai-app chart at `app-common/charts/ai-app`.\n\nCounting the properties listed in the `ai_app_values` class:\n\nclusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain, domain, rootDomain, reltioDomains, spotinstPercent, nodeSelector, tolerations, ingress, serviceMesh, baseRegistryURL, imagePullSecrets, commonLabels, commonPodLabels, commonJobLabels, commonAnnotations, jobAnnotations, gcrregcred, global, commonservicesValues, efsClaimHeapDumps, securityContext, persistent, enableRdmAppServices, helmTimeout, hooks\n\n**34**",
|
||||
"filesRead": [
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/ai-app.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI provided exact answer (34) matching ground truth. Demonstrated clear navigation by locating reference/contracts/index.md and identifying the ai_app_values class. Listed all 34 properties with no hallucinations or extraneous information. Methodology was sound and conclusion precise."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-elasticsearch-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
|
||||
"groundTruth": "32",
|
||||
"llmAnswer": "Looking at the contracts documentation for the elasticsearch chart at `app-common/apps-provider/addons/charts/state-apps/es`, the `elasticsearch_values` class shows the following top-level configuration keys:\n\nenvName, esClusterName, domain, lb, cloudProvider, esVersion, esMasterNodePerZoneCount, esDataNodePerZoneCount, xpkNodeCount, esZone1, esZone2, esZone3, masterStorageSize, masterStorageClass, dataStorageSize, dataStorageClass, masterJavaOpts, masterCpus, masterMems, dataJavaOpts, dataCpus, dataMems, dataVNGName, arch, onDemand, customConfig, customConfig_master, kibanaDomain, authorizationPolicy, metrics, livenessProbe, enableAnonymousAccess\n\n**32**",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/helm/charts/",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI provided the correct count (32) with all keys explicitly listed. Demonstrated direct navigation to the elasticsearch_values class in the contracts documentation at the correct chart path. No hallucinations or extraneous information present."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-hazelcast-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
|
||||
"groundTruth": "32",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe specific chart documentation file for `app-common/apps-provider/addons/charts/state-apps/hz` is not included in the provided reference materials. While the chart is listed in the Helm Charts index, the detailed values.yaml documentation that would show the exact number of top-level configuration keys is not available.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/app-common-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI agent failed to locate the hazelcast chart's values.yaml file at the specified path (app-common/apps-provider/addons/charts/state-apps/hz). The correct answer is 32 top-level configuration keys. While the agent's reasoning about missing reference materials is precise and doesn't hallucinate facts, it demonstrates complete failure in navigation and retrieval of the required information."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "config-redis-count",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
|
||||
"groundTruth": "23",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe provided documentation includes the redis chart in the Helm Charts index (version 1.0.0 at `app-common/charts/redis`), but does not contain the detailed values.yaml structure showing the top-level configuration keys for this chart.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"reference/helm/charts/redis/values.yaml"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI answered NOT_FOUND when the correct answer is 23 top-level configuration keys. While the agent's response reflects that detailed values.yaml structure wasn't in the provided documentation, it failed to retrieve or provide the factually correct answer. The agent demonstrated no navigation to the actual redis chart values.yaml file to count the keys."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "interaction-001",
|
||||
"category": "interaction",
|
||||
"difficulty": "medium",
|
||||
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
|
||||
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
|
||||
"llmAnswer": "Based on the documentation provided, here are the Helm charts that reference Kubernetes services from other components:\n\n1. **hazelcast** (`app-common/apps-provider/addons/charts/state-apps/hz`) - references `kubernetes` service\n2. **hazelcast** (`app-common/charts/hazelcast`) - references `kubernetes` service\n3. **gw-metrics** (`app-common/charts/otel-collector/charts/gw-metrics`) - references `argo-rollouts-metrics` service\n4. **redis** (`app-common/charts/redis`) - references `headless` service",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"diagrams/helm-interactions.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/helm/charts/"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 4,
|
||||
"notes": "AI answer matches ground truth exactly on all four chart-service pairs with correct paths and service names. No hallucinations or omissions. Navigation score reduced to 4 rather than 5 due to lack of visibility into the discovery process, though results suggest effective information retrieval."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "interaction-002",
|
||||
"category": "interaction",
|
||||
"difficulty": "hard",
|
||||
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
|
||||
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
|
||||
"llmAnswer": "Based on the documentation provided, here are the network ports used by multiple Helm charts:\n\n**Port 443:**\n- cluster\n- twistlock-defender\n- eck-operator\n\nThis is the only port that appears across multiple Helm charts in the provided documentation.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/subsystems/network-core.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/subsystems/network-core.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 2,
|
||||
"completeness": 1,
|
||||
"precision": 2,
|
||||
"navigation": 1,
|
||||
"notes": "AI identified only 1 of 4 ports (port 443) and missed one chart for that port (twistlock). Completely failed to identify ports 8889, 9200, and 13133. The claim that port 443 is 'the only port' contradicts ground truth showing 4 ports used by multiple charts. Demonstrates poor information retrieval or severely limited documentation access."
|
||||
},
|
||||
"composite": 30,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
|
||||
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
|
||||
"llmAnswer": "\n\n1. ClusterRole - 51\n2. ServiceAccount - 49\n3. Secret - 43\n4. ExternalSecret - 38\n5. Service - 36\n6. Role - 30\n7. ClusterRoleBinding - 29\n8. Deployment - 29\n9. ConfigMap - 25\n10. RoleBinding - 25",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"diagrams/all-contracts.mmd",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"diagrams/all-contracts.mmd",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer matches ground truth exactly—all 10 resource types present with correct counts and ranking. No hallucinations or extraneous information. Perfect alignment across all dimensions."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-mdm-app-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
|
||||
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe provided documentation shows that the mdm-app chart generates 22 resources total, but does not specify the individual Kubernetes resource types or their counts.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/helm/charts/",
|
||||
"reference/subsystems/app-common.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "Agent failed to locate the specific Kubernetes resource types and their counts from the mdm-app chart. While it identified that 22 resources total are generated, it did not extract the breakdown by resource type (Secret: 5, Job: 4, etc.). The answer 'NOT_FOUND' is honest but represents complete failure to answer the question despite ground truth being available."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "resource-cassandra-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
|
||||
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
|
||||
"llmAnswer": "Based on the contracts documentation provided, the cassandra chart generates the following Kubernetes resource types:\n\n- StatefulSet: 1\n- Service: 2\n- CronJob: 1\n- Job: 1\n\nNote: The helm index indicates the cassandra chart contains 14 total resources, but the contracts documentation only details these 5 resources. The remaining 9 resources are not specified in the provided files.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/all-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/charts/cassandra.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/all-contracts.mmd",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 3,
|
||||
"completeness": 1,
|
||||
"precision": 4,
|
||||
"navigation": 2,
|
||||
"notes": "AI correctly identified 4 of 13 resource types with accurate counts, but missed 9 types entirely (ExternalSecret, ClusterSecretStore, ConfigMap, Secret, ServiceAccount, Role, RoleBinding, ClusterRole, ClusterRoleBinding). The agent acknowledged incomplete documentation access rather than locating comprehensive resource specifications. No hallucinations, but severe coverage gap indicates failure to find authoritative source."
|
||||
},
|
||||
"composite": 50,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-jenkins-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
|
||||
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe documentation shows that the jenkins chart (version 5.8.45) generates 29 resources total, but the specific Kubernetes resource types and their individual counts are not detailed in the provided files.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/all-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/jenkins.md",
|
||||
"diagrams/helm-interactions.mmd",
|
||||
"reference/contracts/index.md",
|
||||
"diagrams/all-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI claimed information was not available in provided files, but ground truth demonstrates 19 distinct Kubernetes resource types with specific counts are documented. Agent failed to locate or extract the required data."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "cross-sub-001",
|
||||
"category": "cross-subsystem",
|
||||
"difficulty": "hard",
|
||||
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
|
||||
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
|
||||
"llmAnswer": "Based on the Cross-Subsystem Dependencies table in the system architecture document:\n\n- **ipam-tools**: 6 calls + 0 imports = 6 total\n- **compute-tools**: 4 calls + 0 imports = 4 total\n- **app-common**: 1 call + 0 imports = 1 total",
|
||||
"filesRead": [
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/all-contracts.mmd"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/app-common.md",
|
||||
"diagrams/all-contracts.mmd"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 5,
|
||||
"completeness": 5,
|
||||
"precision": 5,
|
||||
"navigation": 5,
|
||||
"notes": "AI answer matches ground truth exactly on all three subsystems with correct dependency counts (ipam-tools: 6, compute-tools: 4, app-common: 1). The breakdown into calls and imports adds useful detail without introducing errors. Proper source attribution demonstrates successful navigation to the architecture documentation."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "arch-002",
|
||||
"category": "architectural",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
|
||||
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
|
||||
"llmAnswer": "NOT_FOUND\n\nThe documentation provided shows the mdm-app Helm chart version is **0.7.4**, but the appVersion is not included in the reference materials.",
|
||||
"filesRead": [
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"plannedFiles": [
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/",
|
||||
"reference/contracts/index.md",
|
||||
"reference/system-architecture.md"
|
||||
],
|
||||
"score": {
|
||||
"accuracy": 0,
|
||||
"completeness": 0,
|
||||
"precision": 5,
|
||||
"navigation": 0,
|
||||
"notes": "AI agent returned NOT_FOUND despite ground truth showing version 0.7.4 and appVersion 2025.12-12-develop are available. Agent failed to locate the mdm-app Helm chart metadata. No hallucinated data, but complete failure to retrieve required information."
|
||||
},
|
||||
"composite": 25,
|
||||
"notFound": true
|
||||
}
|
||||
]
|
||||
}
|
||||
280
eval-agent.js
Normal file
280
eval-agent.js
Normal file
@@ -0,0 +1,280 @@
|
||||
/**
 * Eval Track 1: Agent File-Browsing Benchmark
 *
 * Simulates a file-browsing agent with two LLM calls per question: the model
 * is first shown a depth-limited listing of the docs directory and picks up
 * to five files to read, then answers using only the contents of those files.
 * Tests whether the doc STRUCTURE is navigable by an AI agent.
 *
 * Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]
 */
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { callLLM } = require('./prose.js');
|
||||
|
||||
/**
 * Normalise one line of the planner's reply into a docs-relative path.
 *
 * Strips bullet and numbered-list markers, backticks, and the " (1.2K)" /
 * " (3 items)" annotations that the tree listing appends to each entry, in
 * case the model echoes a tree line verbatim.
 *
 * @param {string} line - One raw line of the planner's reply.
 * @returns {string} The cleaned path (may be empty).
 */
function cleanPlannedPath(line) {
  return line
    .trim()
    .replace(/^[-*•]\s*/, '')
    .replace(/^\d+[.)]\s+/, '')
    .replace(/`/g, '')
    .replace(/\s+\((?:\d+(?:\.\d+)?K|\d+ items)\)$/, '')
    .trim();
}

/**
 * Read one documentation file, capped at 15K characters.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} relPath - Path relative to docsDir (model-supplied, untrusted).
 * @returns {string|null} File content (with a truncation marker when capped),
 *   or null when the path escapes docsDir, is not a regular file, or cannot
 *   be read.
 */
function readDocFile(docsDir, relPath) {
  const root = path.resolve(docsDir);
  // path.join (not resolve) so a leading "/" is still treated as docs-relative.
  const absPath = path.join(root, relPath);
  const rel = path.relative(root, absPath);
  // The path comes from an LLM: never read outside the docs directory.
  if (rel === '' || rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
    return null;
  }
  try {
    // Directories (e.g. "reference/helm/charts/") exist but are not readable as files.
    if (!fs.statSync(absPath).isFile()) return null;
    const content = fs.readFileSync(absPath, 'utf8');
    return content.length > 15000
      ? content.substring(0, 15000) + '\n... (truncated)'
      : content;
  } catch (err) {
    // A missing file is an expected planner miss; anything else is worth surfacing.
    if (err.code !== 'ENOENT' && err.code !== 'ENOTDIR') {
      console.warn(`agentBrowse: could not read ${relPath}: ${err.message}`);
    }
    return null;
  }
}

/**
 * Simulate an agent browsing the doc tree with file tools.
 *
 * Two LLM calls are made: a planning call that sees the directory listing
 * (three levels deep, so entries nested deeper are only visible as an item
 * count on their parent directory) and names up to five files, and an
 * answering call that sees only the content of the files that could be read.
 * If none of the planned paths is readable, two index files are used instead.
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {string} docsDir - Root of the documentation tree.
 * @param {object} llmOpts - Options forwarded to callLLM (maxTokens and
 *   temperature are overridden per call).
 * @returns {Promise<{answer: *, filesRead: string[], plannedFiles: string[]}>}
 *   The model's answer, the files actually read, and the files it asked for.
 */
async function agentBrowse(question, docsDir, llmOpts) {
  // Step 1: Agent sees the directory tree
  const tree = buildTree(docsDir, '', 3);

  // Step 2: Agent picks which files to read based on the question + tree
  const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.

FILE TREE:
${tree}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Based on the file tree, which files should you read to answer this question? List up to 5 file paths (most relevant first). Think about:
- Index files that might have summary tables
- Specific chart/subsystem docs that match the question topic
- Architecture overview docs for system-wide questions

Respond with ONLY the file paths, one per line. No explanation.`;

  const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });

  // Parse file paths from plan
  const plannedFiles = planRaw.split('\n')
    .map(l => cleanPlannedPath(l))
    .filter(l => l.length > 0 && !l.startsWith('#'))
    .slice(0, 5);

  // Step 3: Read the planned files
  let context = '';
  const filesRead = [];
  const addToContext = (relPath) => {
    if (filesRead.includes(relPath)) return; // planner listed the same file twice
    const content = readDocFile(docsDir, relPath);
    if (content === null) return;
    context += `\n=== ${relPath} ===\n${content}\n`;
    filesRead.push(relPath);
  };
  for (const relPath of plannedFiles) addToContext(relPath);

  // Step 4: If none of the planned paths was readable, fall back to the main index files
  if (filesRead.length === 0) {
    const fallbacks = ['reference/system-architecture.md', 'reference/helm/index.md'];
    for (const fb of fallbacks) addToContext(fb);
  }

  // Step 5: Agent answers from the files it read
  const answerPrompt = `You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:

${context}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".
Be precise and match the expected format.

Answer:`;

  const answer = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });

  return { answer, filesRead, plannedFiles };
}
|
||||
|
||||
/**
 * Build a directory tree string.
 *
 * Produces one line per entry, sorted by name, using "/"-joined paths relative
 * to the starting directory. Directories render as "path/ (N items)" and files
 * as "path (S.SK)" where S.S is the size in KiB to one decimal. Entries whose
 * name starts with "." are not listed, although they are still included in a
 * parent directory's item count. Entries that cannot be inspected (for
 * example a dangling symlink or an unreadable directory) are skipped instead
 * of aborting the whole listing.
 *
 * @param {string} dir - Directory to list.
 * @param {string} prefix - Relative path prepended to each entry ('' at the root).
 * @param {number} maxDepth - Levels to descend; 1 lists only `dir` itself,
 *   0 or less yields an empty string.
 * @returns {string} Newline-joined listing, or '' if `dir` cannot be read.
 */
function buildTree(dir, prefix, maxDepth) {
  if (maxDepth <= 0) return '';

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true })
      .sort((a, b) => a.name.localeCompare(b.name));
  } catch {
    return '';
  }

  const lines = [];
  for (const e of entries) {
    if (e.name.startsWith('.')) continue;
    const relPath = prefix ? `${prefix}/${e.name}` : e.name;
    const absPath = path.join(dir, e.name);

    if (e.isDirectory()) {
      let childCount;
      try {
        childCount = fs.readdirSync(absPath).length;
      } catch {
        continue; // unreadable directory: the agent could not browse it either
      }
      lines.push(`${relPath}/ (${childCount} items)`);
      if (maxDepth > 1) {
        lines.push(buildTree(absPath, relPath, maxDepth - 1));
      }
    } else {
      let size;
      try {
        // statSync follows symlinks, so a dangling link throws here.
        size = fs.statSync(absPath).size;
      } catch {
        continue;
      }
      lines.push(`${relPath} (${(size / 1024).toFixed(1)}K)`);
    }
  }

  // Recursing into an empty directory contributes '', which is dropped here.
  return lines.filter(l => l).join('\n');
}
|
||||
|
||||
/**
 * Coerce one judge-supplied score into a finite number within 0-5.
 * Anything that is not a finite number after Number() coercion counts as 0.
 *
 * @param {*} value - Raw value taken from the judge's JSON.
 * @returns {number} Score clamped to the 0-5 range.
 */
function clampJudgeScore(value) {
  const n = Number(value);
  if (!Number.isFinite(n)) return 0;
  return Math.min(5, Math.max(0, n));
}

/**
 * Pull the judge's JSON object out of its raw reply.
 * Tries the widest "{...}" span first (so braces inside "notes" survive), then
 * the narrowest one in case the judge appended further braces after the object.
 *
 * @param {string} raw - Raw judge reply.
 * @returns {object|null} Parsed object, or null if no candidate parses.
 */
function extractJudgeJson(raw) {
  for (const pattern of [/\{[\s\S]*\}/, /\{[\s\S]*?\}/]) {
    const match = raw.match(pattern);
    if (!match) continue;
    try {
      const parsed = JSON.parse(match[0]);
      if (parsed && typeof parsed === 'object') return parsed;
    } catch {
      // Not valid JSON for this span; try the next candidate.
    }
  }
  return null;
}

/**
 * Score using LLM-as-judge (same as eval.js).
 *
 * Asks the judge model to grade the answer against the question's ground
 * truth on four 0-5 dimensions and returns the parsed verdict. The four
 * scores are always returned as numbers within 0-5, so callers can sum them
 * safely even when the judge emits strings, omits a field, or goes out of
 * range. If no JSON object can be parsed from the reply, all scores are 0 and
 * `notes` carries the first 100 characters of the raw reply.
 *
 * @param {{question: string, answerType: string, answer: string}} question
 * @param {string} llmAnswer - The answer being graded.
 * @param {object} llmOpts - Options forwarded to callLLM (maxTokens and
 *   temperature are overridden).
 * @returns {Promise<{accuracy: number, completeness: number, precision: number,
 *   navigation: number, notes: string}>}
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI agent's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}

Score on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts?
2. COMPLETENESS: Does it cover all items in the ground truth?
3. PRECISION: Is it free of hallucinated or incorrect extra information?
4. NAVIGATION: Did the agent demonstrate it could find the right information? (0=couldn't find anything, 5=went straight to the right file)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.

Respond in EXACTLY this JSON format:
{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}`;

  const raw = String(await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 }));

  const parsed = extractJudgeJson(raw);
  if (parsed) {
    return {
      ...parsed,
      accuracy: clampJudgeScore(parsed.accuracy),
      completeness: clampJudgeScore(parsed.completeness),
      precision: clampJudgeScore(parsed.precision),
      navigation: clampJudgeScore(parsed.navigation),
      notes: typeof parsed.notes === 'string' ? parsed.notes : '',
    };
  }
  return { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
}
|
||||
|
||||
/**
 * Run the agent eval.
 *
 * Loads the questions file, keeps the questions whose `audience` includes
 * 'machine', and for each one runs agentBrowse followed by scoreAnswer
 * (sequentially). A failure in either step is recorded on that question (an
 * "ERROR: ..." answer, or an all-zero score) instead of aborting the run;
 * note that an "ERROR: ..." answer is still passed to the judge. Progress is
 * printed to stdout as the run proceeds.
 *
 * Each question's composite is the sum of its four 0-5 scores as a whole
 * percentage of 20. A question counts as not-found when its answer contains
 * the text "NOT_FOUND" anywhere. Averages are reported as 0 when no question
 * matched, and non-numeric scores from the judge count as 0.
 *
 * @param {string} docsDir - Root of the documentation tree under test.
 * @param {string} questionsPath - JSON file with a top-level `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to the LLM calls.
 * @returns {Promise<object>} Report with overall and per-dimension averages
 *   (formatted strings), not-found statistics, per-category and per-difficulty
 *   averages of the composite, and the per-question `results`.
 * @throws {TypeError} If the questions file has no `questions` array.
 */
async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!Array.isArray(questionsData.questions)) {
    throw new TypeError(`${questionsPath}: expected a top-level "questions" array`);
  }
  // Questions without a usable audience field are skipped, not fatal.
  const questions = questionsData.questions.filter(q =>
    Boolean(q)
    && (Array.isArray(q.audience) || typeof q.audience === 'string')
    && q.audience.includes('machine'));

  console.log(`Agent Eval: ${questions.length} machine-audience questions`);

  const dimensions = ['accuracy', 'completeness', 'precision', 'navigation'];
  // Judge output is untrusted: "5" or a missing field must not poison the sums.
  const toPoints = (value) => {
    const num = Number(value);
    return Number.isFinite(num) ? num : 0;
  };

  const results = [];
  const totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    let browseResult;
    try {
      browseResult = await agentBrowse(q, docsDir, llmOpts);
    } catch (err) {
      browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
    }

    let rawScore;
    try {
      rawScore = await scoreAnswer(q, browseResult.answer, llmOpts);
    } catch (err) {
      rawScore = { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Score error: ${err.message}` };
    }
    const score = { ...rawScore };
    for (const k of dimensions) score[k] = toPoints(score[k]);

    const isNotFound = String(browseResult.answer).includes('NOT_FOUND');
    if (isNotFound) notFound++;

    for (const k of dimensions) totals[k] += score[k];

    const composite = ((score.accuracy + score.completeness + score.precision + score.navigation) / 20 * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision} N:${score.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer: browseResult.answer,
      filesRead: browseResult.filesRead,
      plannedFiles: browseResult.plannedFiles,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  // With no questions every average is reported as 0 rather than NaN.
  const perQuestion = (sum) => (n > 0 ? sum / n : 0);
  const totalPoints = Object.values(totals).reduce((a, b) => a + b, 0);
  const report = {
    evalType: 'agent',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (n > 0 ? totalPoints / (n * 20) * 100 : 0).toFixed(1),
    avgAccuracy: perQuestion(totals.accuracy).toFixed(2),
    avgCompleteness: perQuestion(totals.completeness).toFixed(2),
    avgPrecision: perQuestion(totals.precision).toFixed(2),
    avgNavigation: perQuestion(totals.navigation).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (perQuestion(notFound) * 100).toFixed(1) + '%',
    byCategory: {},
    byDifficulty: {},
    results,
  };

  // Aggregate by category and difficulty
  for (const r of results) {
    for (const groupKey of ['category', 'difficulty']) {
      const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;
      const key = r[groupKey];
      if (!group[key]) group[key] = { total: 0, count: 0 };
      group[key].total += r.composite;
      group[key].count++;
    }
  }
  // Replace the running totals with the published { avg, count } shape.
  for (const group of [report.byCategory, report.byDifficulty]) {
    for (const [k, v] of Object.entries(group)) {
      group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };
    }
  }

  return report;
}
|
||||
|
||||
// CLI entry point: runs only when this file is executed directly, not when
// it is require()d for its runAgentEval export.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-agent-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  // The model can be overridden through the LLM_MODEL environment variable.
  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  // Print one "name: avg% (count questions)" line per group under a heading.
  const printGroups = (heading, groups) => {
    console.log(`\n${heading}:`);
    Object.entries(groups).forEach(([name, stats]) => {
      console.log(` ${name}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Run the eval, print the summary, then write the full report to disk.
  const main = async () => {
    const report = await runAgentEval(docsDir, questionsPath, { model });
    const rule = '═'.repeat(60);

    console.log(`\n${rule}`);
    console.log('AGENT EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5 Navigation: ${report.avgNavigation}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printGroups('By Category', report.byCategory);
    printGroups('By Difficulty', report.byDifficulty);

    // Five lowest composites, sorted on a copy so report.results keeps its order.
    const weakest = [...report.results]
      .sort((left, right) => left.composite - right.composite)
      .slice(0, 5);
    console.log('\nWeakest:');
    weakest.forEach((entry) => {
      const filesList = entry.filesRead.join(', ') || 'none';
      console.log(` [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 70)}... (read: ${filesList})`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  main().catch((err) => {
    console.error('Agent eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runAgentEval };
|
||||
779
eval-human-report.json
Normal file
779
eval-human-report.json
Normal file
@@ -0,0 +1,779 @@
|
||||
{
|
||||
"evalType": "human",
|
||||
"timestamp": "2026-03-09T23:55:39.520Z",
|
||||
"docsDir": "./foxtrot-docs",
|
||||
"totalQuestions": 28,
|
||||
"overallScore": "63.9",
|
||||
"avgClarity": "3.29",
|
||||
"avgActionability": "2.82",
|
||||
"avgCompleteness": "3.21",
|
||||
"avgFindability": "3.46",
|
||||
"notFoundCount": 5,
|
||||
"notFoundRate": "17.9%",
|
||||
"byCategory": {
|
||||
"structural": {
|
||||
"avg": "63.8",
|
||||
"count": 4
|
||||
},
|
||||
"dependency": {
|
||||
"avg": "20.0",
|
||||
"count": 5
|
||||
},
|
||||
"contract": {
|
||||
"avg": "85.0",
|
||||
"count": 1
|
||||
},
|
||||
"configuration": {
|
||||
"avg": "67.0",
|
||||
"count": 5
|
||||
},
|
||||
"interaction": {
|
||||
"avg": "95.0",
|
||||
"count": 2
|
||||
},
|
||||
"resource": {
|
||||
"avg": "91.3",
|
||||
"count": 4
|
||||
},
|
||||
"cross-subsystem": {
|
||||
"avg": "65.0",
|
||||
"count": 2
|
||||
},
|
||||
"architectural": {
|
||||
"avg": "87.5",
|
||||
"count": 2
|
||||
},
|
||||
"scenario": {
|
||||
"avg": "51.7",
|
||||
"count": 3
|
||||
}
|
||||
},
|
||||
"byDifficulty": {
|
||||
"easy": {
|
||||
"avg": "64.0",
|
||||
"count": 10
|
||||
},
|
||||
"medium": {
|
||||
"avg": "58.1",
|
||||
"count": 13
|
||||
},
|
||||
"hard": {
|
||||
"avg": "79.0",
|
||||
"count": 5
|
||||
}
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"id": "struct-001",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
|
||||
"groundTruth": "76",
|
||||
"humanAnswer": "76",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 2,
|
||||
"completeness": 3,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted correct answer from well-organized reference docs. Helm index and system architecture are intuitive locations. However, knowing the chart count alone provides minimal actionability—engineers need chart names, purposes, and dependencies to act. Docs appear to contain the metric but likely lack operational context."
|
||||
},
|
||||
"composite": 75,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-002",
|
||||
"category": "structural",
|
||||
"difficulty": "medium",
|
||||
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
|
||||
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
|
||||
"humanAnswer": "1. cluster (`app-common/apps-provider/addons/charts/common/cluster`) - 55 resources\n2. cluster (`app-common/charts/cluster`) - 32 resources\n3. jenkins - 29 resources\n4. kube-state-metrics - 27 resources\n5. foxtrot-eks - 24 resources",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 4,
|
||||
"actionability": 4,
|
||||
"completeness": 5,
|
||||
"findability": 4,
|
||||
"notes": "Reader extracted all 5 charts with exact resource counts correctly. Documentation was clear enough to find complete information, though it required consulting two reference files rather than a single authoritative source. Data is actionable for capacity planning but lacks context on implications or recommended actions."
|
||||
},
|
||||
"composite": 85,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-003",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "How many subsystems does the Foxtrot codebase contain?",
|
||||
"groundTruth": "11",
|
||||
"humanAnswer": "12",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/control-core.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 2,
|
||||
"actionability": 2,
|
||||
"completeness": 2,
|
||||
"findability": 4,
|
||||
"notes": "Reader found the right documentation in logical locations but arrived at incorrect count (12 vs 11). Suggests system-architecture.md lacks explicit total count or subsystem definitions are ambiguous. Reader consulted 11 subsystem files yet answered 12, indicating either miscounting or unclear delineation of what constitutes a subsystem. Documentation structure is discoverable, but the core metric is not clearly stated."
|
||||
},
|
||||
"composite": 50,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "struct-004",
|
||||
"category": "structural",
|
||||
"difficulty": "easy",
|
||||
"question": "Which subsystem contains the most files, and how many?",
|
||||
"groundTruth": "runtime: 16309 files",
|
||||
"humanAnswer": "The **runtime** subsystem contains the most files with **16,309** files.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/control-core.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 3,
|
||||
"actionability": 2,
|
||||
"completeness": 2,
|
||||
"findability": 2,
|
||||
"notes": "Reader found correct answer, but runtime.md is absent from consulted docs despite being the largest subsystem. Information likely buried in system-architecture.md rather than in dedicated runtime subsystem documentation. Documentation structure is incomplete—runtime should have its own reference file like other subsystems (account-common, compute-core, network-core, etc.). Metric is informational only, not actionable for engineers."
|
||||
},
|
||||
"composite": 45,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-external-dns-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
|
||||
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
|
||||
"humanAnswer": "Looking at the Helm Charts index table in `reference/helm/index.md`, the external-dns chart at `app-common/apps-provider/addons/charts/common/external-dns` shows:\n\n| Chart | Path | Version | Resources | Dependencies | Interactions |\n|---|---|---|---|---|---|\n| external-dns | `app-common/apps-provider/addons/charts/common/external-dns` | 1.17.0 | 0 | **1** | 0 |\n\nThe table indicates it has **1 dependency**, but the specific name of that dependency is not listed in the provided documentation.\n\n**Answer: NOT_FOUND** (the count is documented as 1, but the specific dependency name is not provided in these docs)",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 0,
|
||||
"actionability": 0,
|
||||
"completeness": 0,
|
||||
"findability": 0,
|
||||
"notes": "Documentation shows dependency count (1) but omits the actual dependency name and version. The table structure indicates dependencies exist without providing the essential details needed to understand or act on them. An engineer cannot determine what external-dns depends on from the provided documentation."
|
||||
},
|
||||
"composite": 0,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "dep-ingress-nginx-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
|
||||
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
|
||||
"humanAnswer": "NOT_FOUND\n\nThe documentation shows that the ingress-nginx chart at `app-common/apps-provider/addons/charts/common/ingress-nginx` has 1 dependency (from the Helm Charts index table), but the specific dependency is not listed in the provided documentation.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 0,
|
||||
"actionability": 0,
|
||||
"completeness": 0,
|
||||
"findability": 0,
|
||||
"notes": "Reader could not locate the ingress-nginx dependency information despite it being referenced in the Helm Charts index. The documentation indicates a dependency exists but fails to provide the actual dependency details (ingress-nginx 4.12.0 with condition flag). Critical information gap between index reference and actual documentation content."
|
||||
},
|
||||
"composite": 0,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "dep-kube-state-metrics-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
|
||||
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
|
||||
"humanAnswer": "NOT_FOUND\n\nThe documentation shows that the kube-state-metrics chart at `app-common/apps-provider/addons/charts/common/kube-state-metrics` has 1 dependency (from the Helm Charts index table), but the specific dependency is not listed in the provided documentation. The detailed chart documentation file that would contain this information is not included in the reference materials.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 0,
|
||||
"actionability": 0,
|
||||
"completeness": 0,
|
||||
"findability": 0,
|
||||
"notes": "Reader could not locate the dependency information. While the Helm index indicates kube-state-metrics has 1 dependency, the actual chart documentation containing the dependency details (kube-state-metrics 5.26.0 with condition flag) was not included in the consulted reference materials. The specific chart documentation file would need to be referenced directly."
|
||||
},
|
||||
"composite": 0,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "dep-open-telemetry-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
|
||||
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
|
||||
"humanAnswer": "\n\nThe \"open-telemetry\" chart at `app-common/apps-provider/addons/charts/common/otel-collector` (version 1.0.1) has the following dependencies:\n\n- opentelemetry-operator (0.26.4)\n- k8s-metrics (0.1.0)\n- agent (0.1.1)\n- common (0.1.0)\n- gw-metrics (0.1.0)\n- gw-traces (0.1.0)\n- prom-scraper (0.1.0)\n\nAll dependencies are conditional and enabled via feature flags in the values configuration.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-agent.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-common.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-gw-metrics.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-gw-traces.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-k8s-metrics.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector-charts-prom-scraper.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-common-otel-collector.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-agent.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-common.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-gw-metrics.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-gw-traces.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-k8s-metrics.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-kube-state-metrics.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-opentelemetry-operator.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector-charts-prom-scraper.md",
|
||||
"reference/helm/charts/app-common-charts-otel-collector.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted all 7 dependencies with exact versions and correctly identified the conditional nature via feature flags. Documentation structure with dedicated chart files for each dependency and main chart reference enabled straightforward discovery. Information is immediately actionable for deployment configuration."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "dep-reltio-prometheus-001",
|
||||
"category": "dependency",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
|
||||
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
|
||||
"humanAnswer": "Looking at the Helm Charts index in `reference/helm/index.md`, the reltio-prometheus chart entry shows:\n\n| Chart | Path | Version | Resources | Dependencies | Interactions |\n|---|---|---|---|---|---|\n| [reltio-prometheus](charts/app-common-apps-provider-addons-charts-common-prometheus.md) | `app-common/apps-provider/addons/charts/common/prometheus` | 1.0.0 | 0 | 2 | 0 |\n\nThe table indicates the chart has **2 dependencies**, but the specific names of those dependencies are not listed in the provided documentation. The detailed dependency information would be in the referenced file `charts/app-common-apps-provider-addons-charts-common-prometheus.md`, which is not included in these docs.\n\n**Answer: NOT_FOUND** (specific dependency names not documented in provided materials)",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 0,
|
||||
"actionability": 0,
|
||||
"completeness": 0,
|
||||
"findability": 0,
|
||||
"notes": "Reader answered NOT_FOUND. Documentation shows dependency count (2) but omits the actual dependency names, versions, and conditions. The referenced detail file was not provided in the documentation set, making the information inaccessible and unusable."
|
||||
},
|
||||
"composite": 0,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "contract-001",
|
||||
"category": "contract",
|
||||
"difficulty": "medium",
|
||||
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
|
||||
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
|
||||
"humanAnswer": "\n\nBased on the documentation, here are the shared secrets and ConfigMaps across multiple Helm charts:\n\n| Secret/ConfigMap | Used By |\n|---|---|\n| `logdna-agent-key` | cluster, cluster |\n| `logdna-agent-tags` | cluster, cluster |\n| `vault-secret` | ai-app, cleanse-app, pms-app, rdm-app, shared-app |\n| `vault-secret-rdm-tasks` | ai-app, cleanse-app, pms-app, rdm-app, shared-app |\n\nThe `vault-secret` and `vault-secret-rdm-tasks` ConfigMaps are the most widely shared, used across five application charts. The logdna secrets are used by the cluster chart (appearing twice, likely indicating multiple uses within that chart).",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 4,
|
||||
"actionability": 4,
|
||||
"completeness": 5,
|
||||
"findability": 4,
|
||||
"notes": "Reader correctly identified all four secrets and their chart associations. Minor issue: misinterpreted logdna secrets as appearing twice within cluster chart when ground truth shows single usage. Docs lack explicit distinction between 'used by' vs 'used multiple times within' a chart, creating ambiguity. All required information present and accessible from appropriate reference files."
|
||||
},
|
||||
"composite": 85,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-mdm-app-domain",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the default value of \"domain\" in the mdm-app chart?",
|
||||
"groundTruth": "cloud.reltio.com",
|
||||
"humanAnswer": "`cloud.reltio.com`",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted exact answer from appropriate Helm chart documentation. Default value was clearly documented and readily discoverable in the mdm-app chart reference file."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-ai-app-domain",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the default value of \"domain\" in the ai-app chart?",
|
||||
"groundTruth": "cloud.reltio.com",
|
||||
"humanAnswer": "`cloud.reltio.com`",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-charts-ai-app.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader successfully located the correct default value. The ai-app chart reference file is the logical location for Helm chart defaults, and the information was presented clearly enough for accurate extraction."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-elasticsearch-domain",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the default value of \"domain\" in the elasticsearch chart?",
|
||||
"groundTruth": "etalon.reltio.com",
|
||||
"humanAnswer": "NOT_FOUND\n\nThe elasticsearch chart documentation lists `domain` as a configuration key with type `unknown` and indicates a default exists (marked as \"yes\"), but the actual default value is not specified in the provided documentation.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-elasticsearch.md",
|
||||
"reference/helm/charts/app-common-charts-elasticsearch.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 0,
|
||||
"actionability": 0,
|
||||
"completeness": 0,
|
||||
"findability": 0,
|
||||
"notes": "Documentation indicates a default value exists for 'domain' but fails to specify the actual value (etalon.reltio.com). Reader reached the correct chart documentation but found incomplete information, making the answer unfindable despite the right location being consulted."
|
||||
},
|
||||
"composite": 0,
|
||||
"notFound": true
|
||||
},
|
||||
{
|
||||
"id": "config-hazelcast-domain",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the default value of \"domain\" in the hazelcast chart?",
|
||||
"groundTruth": "etalon.reltio.com",
|
||||
"humanAnswer": "`internal.reltio.com`",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-hazelcast.md",
|
||||
"reference/helm/charts/app-common-charts-hazelcast.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 2,
|
||||
"actionability": 1,
|
||||
"completeness": 1,
|
||||
"findability": 3,
|
||||
"notes": "Reader located documentation in appropriate files but extracted incorrect default value (internal.reltio.com vs. etalon.reltio.com). Suggests docs either presented wrong default clearly, contained conflicting values, or lacked explicit default designation. Information was findable but inaccurate, making it actionable in the wrong direction—worse than missing entirely."
|
||||
},
|
||||
"composite": 35,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "config-redis-domain",
|
||||
"category": "configuration",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the default value of \"domain\" in the redis chart?",
|
||||
"groundTruth": "internal.reltio.com",
|
||||
"humanAnswer": "`internal.reltio.com`",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-charts-redis.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader located correct answer in appropriate chart reference documentation. The redis chart file is the logical location for this configuration value. Information was sufficiently clear to extract the exact default value without ambiguity."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "interaction-001",
|
||||
"category": "interaction",
|
||||
"difficulty": "medium",
|
||||
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
|
||||
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
|
||||
"humanAnswer": "Based on the \"Service-to-Service References\" section in the documentation:\n\n- **hazelcast** → calls `kubernetes` service\n- **hazelcast** → calls `kubernetes` service (appears to be a second instance)\n- **gw-metrics** → calls `argo-rollouts-metrics` service\n- **redis** → calls `headless` service",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 4,
|
||||
"actionability": 4,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted all four chart-service references correctly from a dedicated 'Service-to-Service References' section. Information is specific and actionable, though context about downstream usage (network policies, service discovery) would enhance actionability. Placement in reference/helm and system-architecture docs aligns with natural search patterns."
|
||||
},
|
||||
"composite": 90,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "interaction-002",
|
||||
"category": "interaction",
|
||||
"difficulty": "hard",
|
||||
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
|
||||
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
|
||||
"humanAnswer": "**Port 443**\n- cluster\n- twistlock-defender\n- eck-operator\n- twistlock\n\n**Port 8889**\n- gw-metrics\n- gw-traces\n- kube-state-metrics\n\n**Port 9200**\n- eck-operator\n- elasticsearch\n\n**Port 13133**\n- agent\n- gw-metrics\n- gw-traces\n- k8s-metrics\n- prom-scraper\n- elasticsearch",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted exact answer matching ground truth. Documentation presented port-to-chart mappings clearly with proper formatting. All four ports and their associated charts were found and correctly listed. Information was located in logical reference sections (system-architecture, helm index) where engineers would naturally search for network configuration details."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
|
||||
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
|
||||
"humanAnswer": "Based on the \"Kubernetes Resource Types (across all charts)\" table in the system architecture documentation, the 10 most common Kubernetes resource types are:\n\n1. ClusterRole — 51\n2. ServiceAccount — 49\n3. Secret — 43\n4. ExternalSecret — 38\n5. Service — 36\n6. Role — 30\n7. ClusterRoleBinding — 29\n8. Deployment — 29\n9. ConfigMap — 25\n10. RoleBinding — 25",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 3,
|
||||
"completeness": 5,
|
||||
"findability": 4,
|
||||
"notes": "Reader extracted exact ground truth from a dedicated table. Information is presented clearly with counts. Docs are in expected locations (system-architecture, helm reference). Actionability limited because data is informational rather than prescriptive—useful for understanding chart composition but lacks guidance on what to do with these metrics."
|
||||
},
|
||||
"composite": 85,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-mdm-app-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
|
||||
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
|
||||
"humanAnswer": "**Kubernetes Resource Types Generated by mdm-app Chart:**\n\n- HorizontalPodAutoscaler: 1\n- Deployment: 2\n- Rollout: 1\n- Job: 4\n- PersistentVolume: 1\n- PersistentVolumeClaim: 2\n- Service: 2\n- PodDisruptionBudget: 1\n- Secret: 5\n- ExternalSecret: 2\n- ServiceAccount: 1\n\n**Total: 22 resources**",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted all 11 resource types with correct counts. Information presented in clear, scannable format. Documentation path (app-common-charts-mdm-app.md) is exactly where an engineer would search for chart-specific resource details. Complete accuracy indicates docs contain all necessary information in accessible form."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-cassandra-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
|
||||
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
|
||||
"humanAnswer": "**Kubernetes Resource Types Generated by cassandra Chart:**\n\n- Service: 2\n- StatefulSet: 1\n- ExternalSecret: 1\n- ClusterSecretStore: 1\n- ConfigMap: 1\n- Secret: 1\n- ServiceAccount: 1\n- Role: 1\n- RoleBinding: 1\n- ClusterRole: 1\n- ClusterRoleBinding: 1\n- CronJob: 1\n- Job: 1\n\n**Total: 14 resources across 13 resource types**",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 4,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted all 13 resource types with exact counts matching ground truth. Formatting with bullets and bold headers makes data immediately scannable. Documentation path is intuitive for cassandra chart specifics. Actionability is high for verification/planning but limited to informational rather than prescriptive guidance."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "resource-jenkins-001",
|
||||
"category": "resource",
|
||||
"difficulty": "medium",
|
||||
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
|
||||
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
|
||||
"humanAnswer": "Based on the Jenkins chart documentation, here are the Kubernetes resource types generated:\n\n- **ConfigMap**: 4\n- **Service**: 3\n- **Secret**: 3\n- **Ingress**: 2\n- **ServiceAccount**: 2\n- **PersistentVolumeClaim**: 1\n- **SecurityGroupPolicy**: 1\n- **PrometheusRule**: 1\n- **BackendConfig**: 1\n- **NetworkPolicy**: 1\n- **PodDisruptionBudget**: 1\n- **PodMonitoring**: 1\n- **Route**: 1\n- **ServiceMonitor**: 1\n- **StatefulSet**: 1\n- **Role**: 1\n- **RoleBinding**: 1\n- **SecretClaim**: 1\n- **Pod**: 1",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-charts-jenkins.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 4,
|
||||
"actionability": 4,
|
||||
"completeness": 4,
|
||||
"findability": 5,
|
||||
"notes": "Reader successfully identified all 19 resource types and got 18/19 counts correct. ConfigMap count error (4 vs 5) indicates minor clarity issue—documentation may not have made this count explicit enough. Docs were in the expected location and contained all necessary information. Error is small enough that it doesn't significantly impact actionability, but precision matters for resource validation."
|
||||
},
|
||||
"composite": 85,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "cross-sub-001",
|
||||
"category": "cross-subsystem",
|
||||
"difficulty": "hard",
|
||||
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
|
||||
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
|
||||
"humanAnswer": "Based on the Cross-Subsystem Dependencies table in the system-architecture.md documentation:\n\n**Subsystems that depend on app-tools:**\n\n1. **ipam-tools** — 6 calls + 0 imports = **6 total**\n2. **compute-tools** — 4 calls + 0 imports = **4 total**\n3. **app-common** — 1 call + 0 imports = **1 total**\n\nipam-tools is the heaviest consumer of app-tools, followed by compute-tools. All dependencies are call-based rather than import-based.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 4,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted exact dependency counts matching ground truth with clear structural presentation. Cross-Subsystem Dependencies table in system-architecture.md is the logical location for this information. Actionability is 4 rather than 5 because the response lacks context about *why* these dependencies exist or their operational impact, which would be needed for architectural decisions."
|
||||
},
|
||||
"composite": 95,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "cross-sub-002",
|
||||
"category": "cross-subsystem",
|
||||
"difficulty": "easy",
|
||||
"question": "Which subsystems are identified as cross-cutting concerns?",
|
||||
"groundTruth": "app-tools",
|
||||
"humanAnswer": "Based on the documentation, specifically the system architecture overview and the subsystems table:\n\n**root**\n\nThis is the only subsystem identified as a cross-cutting concern. All other subsystems (account-common, app-common, app-tools, compute-common, compute-tools, control-core, ipam-core, ipam-tools, network-common, network-core, and runtime) are classified as domain subsystems.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/control-core.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 1,
|
||||
"actionability": 1,
|
||||
"completeness": 2,
|
||||
"findability": 3,
|
||||
"notes": "Reader consulted correct files but extracted wrong answer (root instead of app-tools). Documentation failed to clearly communicate that app-tools is a cross-cutting concern. Despite accessing reference/subsystems/app-tools.md, the classification was either ambiguous, contradictory, or buried. This creates actionability risk—engineers would implement incorrect architecture based on this misclassification."
|
||||
},
|
||||
"composite": 35,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "arch-001",
|
||||
"category": "architectural",
|
||||
"difficulty": "hard",
|
||||
"question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core. Why might this be the case, and what do they actually contain?",
|
||||
"groundTruth": "These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.",
|
||||
"humanAnswer": "These subsystems contain configuration, data structures, and schemas rather than executable code, which explains the 0 functions and 0 modules.\n\n**account-common** holds Helm chart definitions and values files—foundational data structures and constants for account operations without executable logic. Its eight files are YAML-based chart configurations.\n\n**network-common** provides Helm chart templates and values for network infrastructure across cloud providers (AWS VPC, Azure VNet, GCP VPC). It establishes shared constants, types, and schemas through declarative configuration files rather than code.\n\n**network-core** contains Terraform modules and configuration files that define networking abstractions and domain models. Its 87 files include `.tf` files, YAML manifests, and lock files—all configuration-driven infrastructure definitions rather than behavioral code.\n\nThe documentation explicitly describes these as \"configuration-centric\" or \"data-driven\" architectures. They serve as reference layers and stable foundations that other subsystems depend on for shared definitions, but they don't expose functions or modules because they're composed of declarative infrastructure-as-code and configuration files, not executable functions.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/control-core.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 4,
|
||||
"actionability": 3,
|
||||
"completeness": 3,
|
||||
"findability": 5,
|
||||
"notes": "Reader correctly inferred the answer by understanding these are configuration-centric subsystems, but documentation doesn't explicitly state that the analysis pipeline only detects functions/modules from Python/Go/TypeScript/shell—not Helm templates or Terraform. Findability is excellent (subsystem docs in expected locations), but completeness gaps exist: Crossplane compositions mentioned in ground truth aren't reflected in reader's answer, and the pipeline's language limitations aren't documented as the root cause. Actionability is moderate—engineers understand what's there but lack explicit guidance on implications for code analysis or integration patterns."
|
||||
},
|
||||
"composite": 75,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "arch-002",
|
||||
"category": "architectural",
|
||||
"difficulty": "easy",
|
||||
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
|
||||
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
|
||||
"humanAnswer": "**Version:** 0.7.4 \n**App Version:** 2025.12-12-develop",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-charts-mdm-app.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 5,
|
||||
"actionability": 5,
|
||||
"completeness": 5,
|
||||
"findability": 5,
|
||||
"notes": "Reader extracted exact version values with clear formatting. Information is immediately usable for deployment. Helm chart versions are logically located in the dedicated chart reference file. No gaps in documentation coverage."
|
||||
},
|
||||
"composite": 100,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "scenario-001",
|
||||
"category": "scenario",
|
||||
"difficulty": "hard",
|
||||
"question": "If you need to rotate the \"vault-secret\" shared secret, which Helm charts would be affected and need redeployment?",
|
||||
"groundTruth": "ai-app, cleanse-app, pms-app, rdm-app, shared-app",
|
||||
"humanAnswer": "Based on the \"Shared Secrets & ConfigMaps\" table in the Helm Charts documentation, the following charts use the \"vault-secret\" shared secret and would require redeployment upon rotation:\n\n- ai-app\n- cleanse-app\n- pms-app\n- rdm-app\n- shared-app",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/contracts/index.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 4,
|
||||
"actionability": 3,
|
||||
"completeness": 5,
|
||||
"findability": 4,
|
||||
"notes": "Reader correctly identified all five affected charts using a dedicated 'Shared Secrets & ConfigMaps' table in the Helm documentation. Information was clear and complete for identifying scope. Actionability limited because docs appear to specify WHICH charts are affected but lack rotation procedures or redeployment steps. Findability strong—reference/helm/index.md is the natural location for this information, though reader needed to consult multiple documents to confirm."
|
||||
},
|
||||
"composite": 80,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "scenario-002",
|
||||
"category": "scenario",
|
||||
"difficulty": "hard",
|
||||
"question": "A new engineer needs to deploy the MDM application stack. What charts, configuration values, and external dependencies should they understand first?",
|
||||
"groundTruth": "Chart: mdm-app (app-common/charts/mdm-app), Version: 0.7.4\nKey values: clusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain (+28 more)\nResources generated: 22 K8s resources\nInteractions: none detected",
|
||||
"humanAnswer": "To deploy the MDM application stack, a new engineer should understand:\n\n**Primary Chart**\n- `mdm-app` (v0.7.4) located at `app-common/charts/mdm-app` with 22 Kubernetes resources. The chart declares zero dependencies, but this is misleading operationally.\n\n**Configuration Values**\nThe chart requires Helm values, though the specific schema isn't detailed in these docs. However, the app-tools subsystem exposes functions that reveal required configuration patterns:\n- Cross-account MDM companion ARN patching (`patch_crossaccount_mdm_companions`)\n- DynamoDB shared DevOps API configuration (`patch_mdm_systemdynamodb_shared_devops_api`)\n- Shared DevOps API ARN mappings (`patch_shared_devops_api_mdm_arns`)\n\n**External Dependencies**\n- **Vault secrets**: MDM consumes `vault-secret` and `vault-secret-rdm-tasks` ConfigMaps/Secrets, shared with ai-app, cleanse-app, pms-app, rdm-app, and shared-app\n- **RDM companion relationship**: MDM operates alongside rdm-app (v0.5.12) as a companion application; the app-tools subsystem includes `get_rdm_mdm_companions` logic indicating they're provisioned together\n- **Root subsystem**: Like all app-common charts, mdm-app depends on root cross-cutting concerns for foundational types and configuration\n\n**Deployment Context**\nMDM deploys within the app-common subsystem (935 files, 28 functions), which provides infrastructure provisioning and lifecycle management. The deployment likely requires running app-tools provisioning scripts to establish vault secrets and cross-account IAM configurations before Helm chart deployment.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/subsystems/account-common.md",
|
||||
"reference/subsystems/app-tools.md",
|
||||
"reference/subsystems/compute-common.md",
|
||||
"reference/subsystems/compute-tools.md",
|
||||
"reference/subsystems/ipam-core.md",
|
||||
"reference/subsystems/ipam-tools.md",
|
||||
"reference/subsystems/network-common.md",
|
||||
"reference/subsystems/network-core.md",
|
||||
"reference/subsystems/root.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 3,
|
||||
"actionability": 2,
|
||||
"completeness": 2,
|
||||
"findability": 2,
|
||||
"notes": "Reader provided well-structured context but diverged from ground truth. Critical gap: the 30+ specific configuration values (clusterName, environmentType, arch, etc.) are not listed or referenced. Instead, reader inferred deployment patterns from app-tools functions, which is valuable but doesn't address the documented values schema. Information scattered across 11 files suggests poor consolidation in helm documentation. New engineer would lack concrete values to configure before deployment."
|
||||
},
|
||||
"composite": 45,
|
||||
"notFound": false
|
||||
},
|
||||
{
|
||||
"id": "scenario-003",
|
||||
"category": "scenario",
|
||||
"difficulty": "medium",
|
||||
"question": "What network ports does the Cassandra chart expose, and what are they typically used for?",
|
||||
"groundTruth": "Ports: 7000, 7001, 7199, 9042, 9160\n7000: Cassandra inter-node cluster communication\n7001: Cassandra SSL inter-node\n7199: JMX monitoring\n9042: CQL native transport (client connections)\n9160: Thrift client API (legacy)",
|
||||
"humanAnswer": "The Cassandra chart exposes five network ports via the StatefulSet template:\n\n- Port `7000`\n- Port `7001`\n- Port `7199`\n- Port `9042`\n- Port `9160`\n\nHowever, the documentation doesn't specify what each port is typically used for. The chart definition only lists the port numbers and their source template file (`templates/cassandra-statefulset.yaml`), but doesn't include descriptions of their purposes or protocols.",
|
||||
"docsUsed": [
|
||||
"reference/system-architecture.md",
|
||||
"reference/helm/index.md",
|
||||
"reference/helm/charts/app-common-apps-provider-addons-charts-state-cluster-cassandra.md"
|
||||
],
|
||||
"score": {
|
||||
"clarity": 2,
|
||||
"actionability": 1,
|
||||
"completeness": 1,
|
||||
"findability": 2,
|
||||
"notes": "Documentation lists port numbers but omits their purposes entirely. Reader correctly identified all 5 ports but explicitly noted the absence of functional descriptions. Without knowing what each port does (inter-node communication, JMX, CQL, Thrift), engineers cannot configure networking, firewall rules, or understand cluster architecture. Critical operational context is missing from all consulted docs."
|
||||
},
|
||||
"composite": 30,
|
||||
"notFound": false
|
||||
}
|
||||
]
|
||||
}
|
||||
272
eval-human.js
Normal file
272
eval-human.js
Normal file
@@ -0,0 +1,272 @@
|
||||
/**
|
||||
* Eval Track 2: Human Readability Benchmark
|
||||
*
|
||||
* Tests whether the docs are useful for a human engineer.
|
||||
* Feeds the FULL relevant doc page(s) to the LLM and scores on:
|
||||
* - Clarity: Is the prose understandable?
|
||||
* - Actionability: Could you act on this information?
|
||||
* - Findability: Would a human naturally look in these files for the answer?
|
||||
* - Completeness: Is the answer fully covered?
|
||||
*
|
||||
* Usage: node eval-human.js <docs-dir> <questions.json> [output.json]
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { callLLM } = require('./prose.js');
|
||||
|
||||
/**
 * Find the most relevant doc files for a human-audience question.
 *
 * Selection is keyword-driven on the lowercased question text:
 *   - the architecture overview is always included when it exists;
 *   - the helm index is added for helm/chart/secret/port/deploy/service questions;
 *   - chart pages are added when their filename contains a chart name that the
 *     question mentions;
 *   - every subsystem page is added for subsystem/cross-cutting/depend questions;
 *   - the contracts index (first 30,000 characters only) is added for
 *     contract/interface/secret/configmap questions.
 *
 * Each page is included at most once, even when its filename matches several
 * mentioned chart names, and directory entries that are not regular files are
 * skipped instead of crashing the read.
 *
 * Finally the list is trimmed to a 60,000-character budget: docs are admitted
 * in selection order, and a doc that would overflow the budget is dropped
 * while later (smaller) docs may still be admitted.
 *
 * @param {{question: string}} question - Question record; only `question` is read.
 * @param {string} docsDir - Root directory of the generated documentation.
 * @returns {Array<{path: string, content: string}>} Selected docs, with `path`
 *   relative to `docsDir`.
 */
function findRelevantDocs(question, docsDir) {
  const CONTRACTS_CHAR_LIMIT = 30000;
  const TOTAL_CHAR_BUDGET = 60000;
  const chartNames = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis', 'jenkins', 'otel-collector', 'twistlock', 'cluster'];

  const docs = [];
  const seen = new Set();

  // Read one doc (relative to docsDir) unless it is missing, not a regular
  // file, or already selected. `charLimit` optionally truncates the content.
  const addDoc = (relPath, charLimit) => {
    if (seen.has(relPath)) return;
    const absPath = path.join(docsDir, relPath);
    if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) return;
    seen.add(relPath);
    const text = fs.readFileSync(absPath, 'utf8');
    docs.push({
      path: relPath,
      content: charLimit === undefined ? text : text.substring(0, charLimit),
    });
  };

  // List a directory relative to docsDir; a missing directory yields no entries.
  const listEntries = (relDir) => {
    const absDir = path.join(docsDir, relDir);
    if (!fs.existsSync(absDir) || !fs.statSync(absDir).isDirectory()) return [];
    return fs.readdirSync(absDir);
  };

  const q = question.question.toLowerCase();
  const mentions = (...keywords) => keywords.some((kw) => q.includes(kw));

  // Always include the architecture overview
  addDoc('reference/system-architecture.md');

  // Helm-related: include helm index
  if (mentions('helm', 'chart', 'secret', 'port', 'deploy', 'service')) {
    addDoc('reference/helm/index.md');
  }

  // Find specific chart docs mentioned in the question
  const mentionedCharts = chartNames.filter((name) => q.includes(name));
  if (mentionedCharts.length > 0) {
    // Listed once; the directory does not change between chart names.
    const chartFiles = listEntries('reference/helm/charts');
    for (const name of mentionedCharts) {
      for (const f of chartFiles) {
        if (f.includes(name)) addDoc(`reference/helm/charts/${f}`);
      }
    }
  }

  // Subsystem-related
  if (mentions('subsystem', 'cross-cutting', 'depend')) {
    for (const f of listEntries('reference/subsystems')) {
      addDoc(`reference/subsystems/${f}`);
    }
  }

  // Contracts (truncated because the index can be massive)
  if (mentions('contract', 'interface', 'secret', 'configmap')) {
    addDoc('reference/contracts/index.md', CONTRACTS_CHAR_LIMIT);
  }

  // Cap total context at 60K
  let total = 0;
  return docs.filter((d) => {
    if (total + d.content.length > TOTAL_CHAR_BUDGET) return false;
    total += d.content.length;
    return true;
  });
}
|
||||
|
||||
/**
 * Have an LLM simulate a human reading the docs.
 *
 * Builds a single prompt containing every selected doc (each prefixed with a
 * `=== path ===` banner), the question, and reading instructions, then hands
 * it to `callLLM` with `maxTokens: 1024` and `temperature: 0.1` (these two
 * settings override any same-named keys in `llmOpts`).
 *
 * The "Match the expected format" instruction is only emitted when the
 * question declares an `answerType`; otherwise the prompt would carry the
 * literal text "undefined" as the expected format.
 *
 * @param {{question: string, answerType?: string}} question - Question record.
 * @param {Array<{path: string, content: string}>} docs - Docs shown to the reader.
 * @param {object} llmOpts - Extra options forwarded to `callLLM`.
 * @returns {Promise<*>} Whatever `callLLM` resolves to (the reader's answer).
 */
async function humanRead(question, docs, llmOpts) {
  const context = docs.map((d) => `\n=== ${d.path} ===\n${d.content}`).join('\n');

  const instructions = [
    '- Read the docs carefully, as a human would',
    '- If the answer requires combining info from multiple sections, do so',
    '- If the docs don\'t cover this, say "NOT_FOUND"',
  ];
  const { answerType } = question;
  if (answerType !== undefined && answerType !== null && answerType !== '') {
    instructions.push(`- Match the expected format: ${answerType}`);
  }

  const prompt = `You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.

DOCUMENTATION:
${context}

QUESTION: ${question.question}

Instructions:
${instructions.join('\n')}

Answer:`;

  return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.1 });
}
|
||||
|
||||
/**
 * Score a reader's answer with human-focused criteria.
 *
 * Asks the judge LLM (via `callLLM`, `maxTokens: 256`, `temperature: 0.0`) to
 * rate clarity, actionability, completeness and findability on a 0-5 scale
 * and to reply with a single JSON object. The first `{ ... }` span in the
 * reply is parsed.
 *
 * The four dimensions are normalised before returning: each is coerced to a
 * number and clamped to [0, 5]; a missing or non-numeric dimension becomes 0.
 * This keeps arithmetic on the result from producing NaN. Any additional keys
 * the judge returned are passed through unchanged.
 *
 * If no JSON object can be extracted or parsed, an all-zero score is returned
 * whose `notes` holds the first 100 characters of the raw reply.
 *
 * @param {{question: string, answer: string}} question - Question with ground truth.
 * @param {string} answer - The simulated reader's answer.
 * @param {Array<{path: string}>} docsUsed - Docs that were shown to the reader.
 * @param {object} llmOpts - Extra options forwarded to `callLLM`.
 * @returns {Promise<{clarity: number, actionability: number, completeness: number, findability: number, notes: string}>}
 */
async function scoreHuman(question, answer, docsUsed, llmOpts) {
  const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.

QUESTION: ${question.question}
GROUND TRUTH: ${question.answer}
READER'S ANSWER: ${answer}
DOCS CONSULTED: ${docsUsed.map((d) => d.path).join(', ')}

Score on these human-centric dimensions (0-5 each):
1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)
2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)
3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)
4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)

If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.

Respond in EXACTLY this JSON format:
{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  const text = typeof raw === 'string' ? raw : String(raw);

  // Coerce one judged dimension to a finite number on the 0-5 scale.
  const toScore = (value) => {
    if (typeof value !== 'number' && typeof value !== 'string') return 0;
    const num = Number(value);
    if (!Number.isFinite(num)) return 0;
    return Math.min(5, Math.max(0, num));
  };

  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    let parsed = null;
    try {
      parsed = JSON.parse(jsonMatch[0]);
    } catch {
      // Malformed JSON from the judge is expected occasionally; fall through
      // to the all-zero "Parse error" result below.
      parsed = null;
    }
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      return {
        ...parsed,
        clarity: toScore(parsed.clarity),
        actionability: toScore(parsed.actionability),
        completeness: toScore(parsed.completeness),
        findability: toScore(parsed.findability),
        notes: parsed.notes === undefined || parsed.notes === null ? '' : String(parsed.notes),
      };
    }
  }

  return { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Parse error: ${text.substring(0, 100)}` };
}
|
||||
|
||||
/**
 * Run the human eval.
 *
 * Loads the questions file, keeps the questions whose `audience` includes
 * 'human' (questions without an `audience` are skipped), and for each one:
 * selects docs with `findRelevantDocs`, obtains a simulated reader answer via
 * `humanRead`, and scores it via `scoreHuman`. Progress is written to stdout.
 *
 * Failure handling per question:
 *   - if reading fails, the answer is recorded as `ERROR: <message>` and the
 *     question receives an all-zero score without calling the judge;
 *   - if scoring fails, the question receives an all-zero score whose notes
 *     carry the error message.
 *
 * Each score dimension is read as a finite number (anything else counts as 0)
 * for both the running totals and the per-question composite, so the two
 * always agree. The composite is the four dimensions summed, divided by 20
 * and expressed as a whole percentage.
 *
 * With zero matching questions the averages and rates are reported as zero
 * instead of NaN.
 *
 * @param {string} docsDir - Root directory of the generated documentation.
 * @param {string} questionsPath - Path to a JSON file with a `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to the LLM helpers.
 * @returns {Promise<object>} Report with overall/average scores, not-found
 *   statistics, `byCategory` / `byDifficulty` averages and per-question `results`.
 * @throws {Error} If the questions file has no `questions` array (file read
 *   and JSON parse errors propagate unchanged).
 */
async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {
  const DIMENSIONS = ['clarity', 'actionability', 'completeness', 'findability'];
  const MAX_POINTS = DIMENSIONS.length * 5;

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`Expected a "questions" array in ${questionsPath}`);
  }
  const questions = questionsData.questions.filter((q) => {
    if (!q) return false;
    const { audience } = q;
    const searchable = Array.isArray(audience) || typeof audience === 'string';
    return searchable && audience.includes('human');
  });

  console.log(`Human Eval: ${questions.length} human-audience questions`);

  // Same coercion for totals and composite so they can never disagree.
  const points = (score, key) => {
    const value = score ? Number(score[key]) : 0;
    return Number.isFinite(value) ? value : 0;
  };
  const zeroScore = (notes) => ({ clarity: 0, actionability: 0, completeness: 0, findability: 0, notes });
  const safeDivide = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);

  const results = [];
  const totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    const docs = findRelevantDocs(q, docsDir);

    let answer;
    let readError = null;
    try {
      answer = String(await humanRead(q, docs, llmOpts));
    } catch (err) {
      readError = err;
      answer = `ERROR: ${err.message}`;
    }

    let score;
    if (readError) {
      // Nothing meaningful to judge; don't spend an LLM call on an error string.
      score = zeroScore(`Read error: ${readError.message}`);
    } else {
      try {
        score = await scoreHuman(q, answer, docs, llmOpts);
      } catch (err) {
        score = zeroScore(`Score error: ${err.message}`);
      }
    }

    const isNotFound = answer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    const dims = {};
    let earned = 0;
    for (const k of DIMENSIONS) {
      dims[k] = points(score, k);
      totals[k] += dims[k];
      earned += dims[k];
    }

    const composite = (earned / MAX_POINTS * 100).toFixed(0);
    console.log(` ${composite}% (Cl:${dims.clarity} Ac:${dims.actionability} Co:${dims.completeness} Fi:${dims.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      humanAnswer: answer,
      docsUsed: docs.map((d) => d.path),
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  const grandTotal = Object.values(totals).reduce((a, b) => a + b, 0);
  const report = {
    evalType: 'human',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (safeDivide(grandTotal, n * MAX_POINTS) * 100).toFixed(1),
    avgClarity: safeDivide(totals.clarity, n).toFixed(2),
    avgActionability: safeDivide(totals.actionability, n).toFixed(2),
    avgCompleteness: safeDivide(totals.completeness, n).toFixed(2),
    avgFindability: safeDivide(totals.findability, n).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (safeDivide(notFound, n) * 100).toFixed(1) + '%',
    byCategory: {},
    byDifficulty: {},
    results,
  };

  // Accumulate composite totals per category and per difficulty...
  for (const r of results) {
    for (const groupKey of ['category', 'difficulty']) {
      const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;
      const key = r[groupKey];
      if (!group[key]) group[key] = { total: 0, count: 0 };
      group[key].total += r.composite;
      group[key].count++;
    }
  }
  // ...then replace each accumulator with its average.
  for (const group of [report.byCategory, report.byDifficulty]) {
    for (const [k, v] of Object.entries(group)) {
      group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };
    }
  }

  return report;
}
|
||||
|
||||
// CLI entry point: only runs when this file is executed directly
// (`node eval-human.js <docs-dir> <questions.json> [output.json]`),
// not when it is required as a module.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-human-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  // Print one heading followed by a line per group (category or difficulty).
  const printGroup = (heading, group) => {
    console.log(heading);
    Object.entries(group).forEach(([label, stats]) => {
      console.log(` ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Run the eval, print the summary to stdout, then persist the full report.
  const main = async () => {
    const report = await runHumanEval(docsDir, questionsPath, { model });
    const rule = '═'.repeat(60);

    console.log('\n' + rule);
    console.log('HUMAN EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Clarity: ${report.avgClarity}/5 Actionability: ${report.avgActionability}/5 Completeness: ${report.avgCompleteness}/5 Findability: ${report.avgFindability}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printGroup('\nBy Category:', report.byCategory);
    printGroup('\nBy Difficulty:', report.byDifficulty);

    // Five lowest-scoring questions; sort a copy so report.results keeps its order.
    const ranked = report.results.slice();
    ranked.sort((a, b) => a.composite - b.composite);
    console.log('\nWeakest:');
    ranked.slice(0, 5).forEach((w) => {
      console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}...`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  // Any failure (eval, printing, or writing the report) ends the process non-zero.
  main().catch((err) => {
    console.error('Human eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runHumanEval };
|
||||
Reference in New Issue
Block a user