From 0cc4abcb0f97dc22c15e19518715305bb4f7abf4 Mon Sep 17 00:00:00 2001 From: Jarvis Prime Date: Mon, 9 Mar 2026 23:40:07 +0000 Subject: [PATCH] Phase 9b: structural documentation improvements\n\n- sysdoc.js: Added Summary Statistics, Top Charts, and K8s Resource Types to architecture doc\n- Addresses ratchet failures where system-wide rollups were missing from generated prose\n- Eval v2 shows minor improvement, though RAG context window still limits wide scatter-gather queries --- eval-questions-iter1.json | 419 +++++++++++++++++++++++++++ eval-report-v2.json | 594 ++++++++++++++++++++++++++++++++++++++ ratchet-history.json | 14 + ratchet.js | 396 +++++++++++++++++++++++++ sysdoc.js | 31 +- 5 files changed, 1452 insertions(+), 2 deletions(-) create mode 100644 eval-questions-iter1.json create mode 100644 eval-report-v2.json create mode 100644 ratchet-history.json create mode 100644 ratchet.js diff --git a/eval-questions-iter1.json b/eval-questions-iter1.json new file mode 100644 index 0000000..bb906a8 --- /dev/null +++ b/eval-questions-iter1.json @@ -0,0 +1,419 @@ +{ + "generated": "2026-03-09T23:20:55.589Z", + "count": 33, + "questions": [ + { + "id": "struct-001", + "category": "structural", + "difficulty": "easy", + "audience": [ + "human", + "machine" + ], + "question": "How many Helm charts are defined across the Foxtrot monorepo?", + "answer": "76", + "answerType": "exact", + "source": "Chart.yaml discovery" + }, + { + "id": "struct-002", + "category": "structural", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", + "answer": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", + "answerType": "ranked-list", + "source": "template scanning" + }, + { + "id": "struct-003", + "category": "structural", + "difficulty": "easy", + "audience": [ + "human", + "machine" + ], + "question": "How many subsystems does the Foxtrot codebase contain?", + "answer": "11", + "answerType": "exact", + "source": "subsystem aggregation" + }, + { + "id": "struct-004", + "category": "structural", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "Which subsystem contains the most files, and how many?", + "answer": "runtime: 16309 files", + "answerType": "exact", + "source": "subsystem aggregation" + }, + { + "id": "dep-external-dns-001", + "category": "dependency", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", + "answer": "external-dns (1.17.0) [condition: external-dns.enabled]", + "answerType": "list", + "source": "app-common/apps-provider/addons/charts/common/external-dns/Chart.yaml" + }, + { + "id": "dep-ingress-nginx-001", + "category": "dependency", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", + "answer": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", + "answerType": "list", + "source": "app-common/apps-provider/addons/charts/common/ingress-nginx/Chart.yaml" + }, + { + "id": "dep-kube-state-metrics-001", + "category": "dependency", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", + "answer": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", + "answerType": "list", + "source": "app-common/apps-provider/addons/charts/common/kube-state-metrics/Chart.yaml" + }, + { + "id": "dep-open-telemetry-001", + "category": "dependency", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", + "answer": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", + "answerType": "list", + "source": "app-common/apps-provider/addons/charts/common/otel-collector/Chart.yaml" + }, + { + "id": "dep-reltio-prometheus-001", + "category": "dependency", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", + "answer": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", + "answerType": "list", + "source": "app-common/apps-provider/addons/charts/common/prometheus/Chart.yaml" + }, + { + "id": "contract-001", + "category": "contract", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", + "answer": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "answerType": "list", + "source": "template interaction scanning" + }, + { + "id": "config-mdm-app-domain", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "What is the default value of \"domain\" in the mdm-app chart?", + "answer": "cloud.reltio.com", + "answerType": "exact", + "source": "app-common/charts/mdm-app/values.yaml" + }, + { + "id": "config-mdm-app-count", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "machine" + ], + "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?", + "answer": "38", + "answerType": "exact", + "source": "app-common/charts/mdm-app/values.yaml" + }, + { + "id": "config-ai-app-domain", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "What is the default value of \"domain\" in the ai-app chart?", + "answer": "cloud.reltio.com", + "answerType": "exact", + "source": "app-common/charts/ai-app/values.yaml" + }, + { + "id": "config-ai-app-count", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "machine" + ], + "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?", + "answer": "34", + "answerType": "exact", + "source": "app-common/charts/ai-app/values.yaml" + }, + { + "id": "config-elasticsearch-domain", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "What is the default value of \"domain\" in the elasticsearch chart?", + "answer": "etalon.reltio.com", + "answerType": "exact", + "source": "app-common/apps-provider/addons/charts/state-apps/es/values.yaml" + }, + { + "id": "config-elasticsearch-count", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "machine" + ], + "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?", + "answer": "32", + "answerType": "exact", + "source": "app-common/apps-provider/addons/charts/state-apps/es/values.yaml" + }, + { + "id": "config-hazelcast-domain", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "What is the default value of \"domain\" in the hazelcast chart?", + "answer": "etalon.reltio.com", + "answerType": "exact", + "source": "app-common/apps-provider/addons/charts/state-apps/hz/values.yaml" + }, + { + "id": "config-hazelcast-count", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "machine" + ], + "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?", + "answer": "32", + "answerType": "exact", + "source": "app-common/apps-provider/addons/charts/state-apps/hz/values.yaml" + }, + { + "id": "config-redis-domain", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "What is the default value of \"domain\" in the redis chart?", + "answer": "internal.reltio.com", + "answerType": "exact", + "source": "app-common/charts/redis/values.yaml" + }, + { + "id": "config-redis-count", + "category": "configuration", + "difficulty": "easy", + "audience": [ + "machine" + ], + "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?", + "answer": "23", + "answerType": "exact", + "source": "app-common/charts/redis/values.yaml" + }, + { + "id": "interaction-001", + "category": "interaction", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", + "answer": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", + "answerType": "list", + "source": "template interaction scanning" + }, + { + "id": "interaction-002", + "category": "interaction", + "difficulty": "hard", + "audience": [ + "human", + "machine" + ], + "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", + "answer": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", + "answerType": "list", + "source": "template port scanning" + }, + { + "id": "resource-001", + "category": "resource", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?", + "answer": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", + "answerType": "ranked-list", + "source": "template resource scanning" + }, + { + "id": "resource-mdm-app-001", + "category": "resource", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.", + "answer": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1", + "answerType": "list", + "source": "app-common/charts/mdm-app/templates/" + }, + { + "id": "resource-cassandra-001", + "category": "resource", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.", + "answer": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "answerType": "list", + "source": "app-common/apps-provider/addons/charts/state-cluster/cassandra/templates/" + }, + { + "id": "resource-jenkins-001", + "category": "resource", + "difficulty": "medium", + "audience": [ + "human", + "machine" + ], + "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", + "answer": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", + "answerType": "list", + "source": "app-common/charts/jenkins/templates/" + }, + { + "id": "cross-sub-001", + "category": "cross-subsystem", + "difficulty": "hard", + "audience": [ + "human", + "machine" + ], + "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?", + "answer": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1", + "answerType": "list", + "source": "dependency matrix" + }, + { + "id": "cross-sub-002", + "category": "cross-subsystem", + "difficulty": "easy", + "audience": [ + "human" + ], + "question": "Which subsystems are identified as cross-cutting concerns?", + "answer": "app-tools", + "answerType": "list", + "source": "subsystem aggregation" + }, + { + "id": "arch-001", + "category": "architectural", + "difficulty": "hard", + "audience": [ + "human" + ], + "question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core. Why might this be the case, and what do they actually contain?", + "answer": "These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.", + "answerType": "explanation", + "source": "architectural analysis" + }, + { + "id": "arch-002", + "category": "architectural", + "difficulty": "easy", + "audience": [ + "human", + "machine" + ], + "question": "What is the current version and appVersion of the mdm-app Helm chart?", + "answer": "version: 0.7.4, appVersion: 2025.12-12-develop", + "answerType": "exact", + "source": "app-common/charts/mdm-app/Chart.yaml" + }, + { + "id": "scenario-001", + "category": "scenario", + "difficulty": "hard", + "audience": [ + "human" + ], + "question": "If you need to rotate the \"vault-secret\" shared secret, which Helm charts would be affected and need redeployment?", + "answer": "ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "answerType": "list", + "source": "template interaction scanning" + }, + { + "id": "scenario-002", + "category": "scenario", + "difficulty": "hard", + "audience": [ + "human" + ], + "question": "A new engineer needs to deploy the MDM application stack. What charts, configuration values, and external dependencies should they understand first?", + "answer": "Chart: mdm-app (app-common/charts/mdm-app), Version: 0.7.4\nKey values: clusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain (+28 more)\nResources generated: 22 K8s resources\nInteractions: none detected", + "answerType": "explanation", + "source": "app-common/charts/mdm-app" + }, + { + "id": "scenario-003", + "category": "scenario", + "difficulty": "medium", + "audience": [ + "human" + ], + "question": "What network ports does the Cassandra chart expose, and what are they typically used for?", + "answer": "Ports: 7000, 7001, 7199, 9042, 9160\n7000: Cassandra inter-node cluster communication\n7001: Cassandra SSL inter-node\n7199: JMX monitoring\n9042: CQL native transport (client connections)\n9160: Thrift client API (legacy)", + "answerType": "explanation", + "source": "app-common/apps-provider/addons/charts/state-cluster/cassandra/templates/" + } + ] +} \ No newline at end of file diff --git a/eval-report-v2.json b/eval-report-v2.json new file mode 100644 index 0000000..0facab7 --- /dev/null +++ b/eval-report-v2.json @@ -0,0 +1,594 @@ +{ + "timestamp": "2026-03-09T23:39:51.586Z", + "docsDir": "./foxtrot-docs", + "questionsFile": "./eval-questions.json", + "totalQuestions": 33, + "overallScore": 67.1, + "avgAccuracy": 2.79, + "avgCompleteness": 2.79, + "avgPrecision": 4.48, + "notFoundCount": 6, + "notFoundRate": "18.2%", + "byCategory": { + "structural": { + "avg": "31.5", + "count": 4 + }, + "dependency": { + "avg": "77.2", + "count": 5 + }, + "contract": { + "avg": "20.0", + "count": 1 + }, + "configuration": { + "avg": "93.3", + "count": 10 + }, + "interaction": { + "avg": "56.5", + "count": 2 + }, + "resource": { + "avg": "81.5", + "count": 4 + }, + "cross-subsystem": { + "avg": "20.0", + "count": 2 + }, + "architectural": { + "avg": "66.5", + "count": 2 + }, + "scenario": { + "avg": "44.3", + "count": 3 + } + }, + "byDifficulty": { + "easy": { + "avg": "75.5", + "count": 15 + }, + "medium": { + "avg": "67.0", + "count": 13 + }, + "hard": { + "avg": "41.2", + "count": 5 + } + }, + "results": [ + { + "id": "struct-001", + "category": "structural", + "difficulty": "easy", + "question": "How many Helm charts are defined across the Foxtrot monorepo?", + "groundTruth": "76", + "llmAnswer": "\n\n27", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 4, + "notes": "AI significantly undercounted Helm charts (27 vs 76). No hallucinated details, but the core answer is factually incorrect and captures only ~35% of actual charts." + }, + "composite": 27, + "notFound": false + }, + { + "id": "struct-002", + "category": "structural", + "difficulty": "medium", + "question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.", + "groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24", + "llmAnswer": "NOT_FOUND\n\nThe documentation does not provide explicit resource counts for the charts. While the \"Resources Generated\" sections list individual resources for each chart, they do not include a total count per chart. To accurately rank the 5 charts producing the most resources, I would need either:\n\n1. Explicit resource count totals in the documentation\n2. A summary table comparing resource counts across charts\n\nThe documentation lists resources by type and template file, but requires manual counting to determine totals, which is not a reliable method for documentation-based answers.", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI returned NOT_FOUND instead of the ranked list with specific resource counts. While it avoided hallucination, it failed to retrieve or provide the ground truth data that was available." + }, + "composite": 33, + "notFound": true + }, + { + "id": "struct-003", + "category": "structural", + "difficulty": "easy", + "question": "How many subsystems does the Foxtrot codebase contain?", + "groundTruth": "11", + "llmAnswer": "NOT_FOUND\n\nThe documentation provided contains only one explicit subsystem definition (the \"root\" subsystem). While the root subsystem's description references other domains (app, compute, ipam, runtime), the documentation does not provide a complete inventory or count of all subsystems in the Foxtrot codebase.", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI correctly avoided hallucination by stating NOT_FOUND rather than guessing, but failed to provide the ground truth answer of 11 subsystems. The response demonstrates appropriate epistemic humility but does not meet the requirement of answering the factual question." + }, + "composite": 33, + "notFound": true + }, + { + "id": "struct-004", + "category": "structural", + "difficulty": "easy", + "question": "Which subsystem contains the most files, and how many?", + "groundTruth": "runtime: 16309 files", + "llmAnswer": "ipam-core, 24", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI identified wrong subsystem (ipam-core vs runtime) and provided drastically incorrect file count (24 vs 16309). Answer format is correct but all factual content is wrong. No hallucinated extra information, just fundamentally incorrect core data." + }, + "composite": 33, + "notFound": false + }, + { + "id": "dep-external-dns-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?", + "groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]", + "llmAnswer": "\n\nexternal-dns (1.17.0)", + "score": { + "accuracy": 3, + "completeness": 3, + "precision": 5, + "notes": "AI correctly identified the dependency name and version but omitted the condition specification '[condition: external-dns.enabled]' which is part of the ground truth. The core information is accurate but incomplete." + }, + "composite": 73, + "notFound": false + }, + { + "id": "dep-ingress-nginx-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?", + "groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]", + "llmAnswer": "\n\ningress-nginx (4.12.0)", + "score": { + "accuracy": 3, + "completeness": 3, + "precision": 5, + "notes": "AI correctly identified the dependency name and version (ingress-nginx 4.12.0) but omitted the condition specification [condition: ingress-nginx.enabled] that is part of the complete ground truth answer." + }, + "composite": 73, + "notFound": false + }, + { + "id": "dep-kube-state-metrics-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?", + "groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]", + "llmAnswer": "\n\nkube-state-metrics (5.26.0)", + "score": { + "accuracy": 3, + "completeness": 3, + "precision": 5, + "notes": "AI correctly identified the dependency name and version (kube-state-metrics 5.26.0) but omitted the condition metadata [condition: kube-state-metrics.enabled] present in the ground truth. Core information is accurate with no hallucination, but the answer is incomplete." + }, + "composite": 73, + "notFound": false + }, + { + "id": "dep-open-telemetry-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?", + "groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]", + "llmAnswer": "\n\nopentelemetry-operator (0.26.4)\nk8s-metrics (0.1.0)\nagent (0.1.1)\ncommon (0.1.0)\ngw-metrics (0.1.0)\ngw-traces (0.1.0)\nprom-scraper (0.1.0)", + "score": { + "accuracy": 4, + "completeness": 4, + "precision": 5, + "notes": "AI correctly identified all 7 dependencies with accurate versions, but omitted the conditional metadata ([condition: ...]) that was part of the ground truth specification. Core facts are accurate and complete, but the answer lacks full specification details." + }, + "composite": 87, + "notFound": false + }, + { + "id": "dep-reltio-prometheus-001", + "category": "dependency", + "difficulty": "medium", + "question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?", + "groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]", + "llmAnswer": "\n\nkube-prometheus-stack (69.4.1)\nprometheus-adapter (4.11.0)", + "score": { + "accuracy": 4, + "completeness": 3, + "precision": 5, + "notes": "AI correctly identified both dependencies with accurate versions (69.4.1 and 4.11.0), but omitted the conditional clauses [condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled] that are part of the complete ground truth specification. The core facts are accurate but the answer lacks the full dependency specification." + }, + "composite": 80, + "notFound": false + }, + { + "id": "contract-001", + "category": "contract", + "difficulty": "medium", + "question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.", + "groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "llmAnswer": "vault-secret\ncluster (app-common/apps-provider/addons/charts/common/cluster), argocd-apps, mdm-app, shared-app, ai-app, auth-app, cleanse-app, pms-app, rdm-app\n\nvault-component\nmdm-app, shared-app, ai-app, auth-app, cleanse-app, pms-app, rdm-app\n\ngcrregcred\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\nlogdna\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\nlogdna-agent\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\nsentinelone\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\ntwistlock-secrets\ntwistlock-defender, twistlock", + "score": { + "accuracy": 1, + "completeness": 1, + "precision": 1, + "notes": "AI found vault-secret but with incorrect chart list (includes cluster, argocd-apps, mdm-app, auth-app not in ground truth). Completely missed vault-secret-rdm-tasks, logdna-agent-key, and logdna-agent-tags. Introduced multiple hallucinated secrets (vault-component, gcrregcred, logdna, logdna-agent, sentinelone, twistlock-secrets) not in ground truth. Only 1 of 4 required items partially present with errors." + }, + "composite": 20, + "notFound": false + }, + { + "id": "config-mdm-app-domain", + "category": "configuration", + "difficulty": "easy", + "question": "What is the default value of \"domain\" in the mdm-app chart?", + "groundTruth": "cloud.reltio.com", + "llmAnswer": "cloud.reltio.com", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. Direct, factual response with no hallucination or missing information." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-mdm-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?", + "groundTruth": "38", + "llmAnswer": "38", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. The response provides the correct count of 38 top-level configuration keys with no extraneous information or hallucination." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-ai-app-domain", + "category": "configuration", + "difficulty": "easy", + "question": "What is the default value of \"domain\" in the ai-app chart?", + "groundTruth": "cloud.reltio.com", + "llmAnswer": "cloud.reltio.com", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. No hallucination, no missing information, direct and correct response." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-ai-app-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?", + "groundTruth": "34", + "llmAnswer": "34", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. The response provides the correct count of 34 top-level configuration keys with no extraneous information or hallucination." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-elasticsearch-domain", + "category": "configuration", + "difficulty": "easy", + "question": "What is the default value of \"domain\" in the elasticsearch chart?", + "groundTruth": "etalon.reltio.com", + "llmAnswer": "\n\netalon.reltio.com", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly with no extraneous information or hallucination" + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-elasticsearch-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. Provided the correct count of 32 top-level configuration keys with no extraneous information or hallucination." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-hazelcast-domain", + "category": "configuration", + "difficulty": "easy", + "question": "What is the default value of \"domain\" in the hazelcast chart?", + "groundTruth": "etalon.reltio.com", + "llmAnswer": "internal.reltio.com", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI provided incorrect value (internal.reltio.com instead of etalon.reltio.com). Answer is factually wrong but contains no hallucinated extra information." + }, + "composite": 33, + "notFound": false + }, + { + "id": "config-hazelcast-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?", + "groundTruth": "32", + "llmAnswer": "32", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. Correct numerical response to a factual query with no extraneous information." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-redis-domain", + "category": "configuration", + "difficulty": "easy", + "question": "What is the default value of \"domain\" in the redis chart?", + "groundTruth": "internal.reltio.com", + "llmAnswer": "internal.reltio.com", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. Direct, factual response with no hallucination or missing information." + }, + "composite": 100, + "notFound": false + }, + { + "id": "config-redis-count", + "category": "configuration", + "difficulty": "easy", + "question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?", + "groundTruth": "23", + "llmAnswer": "23", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. Provided the correct count of 23 top-level configuration keys with no extraneous information or hallucination." + }, + "composite": 100, + "notFound": false + }, + { + "id": "interaction-001", + "category": "interaction", + "difficulty": "medium", + "question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.", + "groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless", + "llmAnswer": "hazelcast (app-common/charts/hazelcast) - kubernetes\nhazelcast (app-common/apps-provider/addons/charts/state-apps/hz) - kubernetes\nredis - headless", + "score": { + "accuracy": 3, + "completeness": 3, + "precision": 5, + "notes": "AI correctly identified 3 of 4 items with accurate paths and service references. Missing the gw-metrics chart entry (app-common/charts/otel-collector/charts/gw-metrics → argo-rollouts-metrics). No hallucinated information." + }, + "composite": 73, + "notFound": false + }, + { + "id": "interaction-002", + "category": "interaction", + "difficulty": "hard", + "question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.", + "groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch", + "llmAnswer": "Port 443\n- eck-operator\n- cluster\n- twistlock-defender\n- twistlock\n\nPort 8889\n- kube-state-metrics\n- gw-traces\n\nPort 9200\n- eck-operator\n- elasticsearch (app-common/apps-provider/addons/charts/state-apps/es)\n- elasticsearch (app-common/charts/elasticsearch)\n\nPort 13133\n- gw-traces\n- elasticsearch (app-common/apps-provider/addons/charts/state-apps/es)\n- elasticsearch (app-common/charts/elasticsearch)\n- k8s-metrics\n\nPort 0\n- elasticsearch (app-common/apps-provider/addons/charts/state-apps/es)\n- elasticsearch (app-common/charts/elasticsearch)\n- k8s-metrics", + "score": { + "accuracy": 2, + "completeness": 2, + "precision": 2, + "notes": "AI correctly identifies ports 443 and 9200 with accurate chart listings. However, it omits gw-metrics from port 8889, and critically misses agent, gw-metrics, and prom-scraper from port 13133. Additionally, it hallucinates Port 0 which does not appear in the ground truth. The answer is incomplete and contains factual errors in chart-to-port mappings." + }, + "composite": 40, + "notFound": false + }, + { + "id": "resource-001", + "category": "resource", + "difficulty": "medium", + "question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?", + "groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25", + "llmAnswer": "NOT_FOUND\n\nThe documentation provided does not contain a comprehensive analysis or summary table that aggregates resource type counts across all Foxtrot Helm charts. While the documentation lists resources generated by individual charts, it does not provide the aggregated count needed to determine the 10 most common resource types across all Foxtrot charts.\n\nTo answer this question accurately, one would need to manually count occurrences across all documented charts and rank them, which is beyond what the provided documentation explicitly states.", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI answered NOT_FOUND instead of providing the ranked list of 10 resource types. While the response contains no hallucinated information, it failed to retrieve or provide any of the correct facts from the ground truth. The answer is factually incorrect and incomplete, though it avoided fabricating data." + }, + "composite": 33, + "notFound": true + }, + { + "id": "resource-mdm-app-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.", + "groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1", + "llmAnswer": "\n\nHorizontalPodAutoscaler: 1\nDeployment: 2\nRollout: 1\nJob: 4\nPersistentVolume: 1\nPersistentVolumeClaim: 2\nService: 2\nPodDisruptionBudget: 1\nSecret: 5\nExternalSecret: 2\nServiceAccount: 1", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer contains all 11 resource types with exact correct counts matching ground truth. Order differs but content is identical." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-cassandra-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.", + "groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "llmAnswer": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer matches ground truth exactly. All 13 resource types listed with correct counts. No omissions or hallucinations." + }, + "composite": 100, + "notFound": false + }, + { + "id": "resource-jenkins-001", + "category": "resource", + "difficulty": "medium", + "question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.", + "groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1", + "llmAnswer": "BackendConfig: 1\nConfigMap: 4\nIngress: 2\nNetworkPolicy: 1\nPod: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nPersistentVolumeClaim: 1\nPrometheusRule: 1\nRole: 1\nRoleBinding: 1\nRoute: 1\nSecret: 3\nSecretClaim: 1\nService: 3\nServiceAccount: 2\nServiceMonitor: 1\nSecurityGroupPolicy: 1\nStatefulSet: 1", + "score": { + "accuracy": 4, + "completeness": 5, + "precision": 5, + "notes": "AI lists all 19 resource types correctly but miscounts ConfigMap as 4 instead of 5. All other counts and types match ground truth exactly." + }, + "composite": 93, + "notFound": false + }, + { + "id": "cross-sub-001", + "category": "cross-subsystem", + "difficulty": "hard", + "question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?", + "groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1", + "llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation does not contain information about which subsystems depend on app-tools or their dependency metrics (call+import counts). While the app-tools subsystem documentation describes its public exports and internal structure, there is no cross-reference documentation showing downstream dependencies or usage patterns from other subsystems.", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI correctly avoided hallucination by stating information unavailable, but failed to locate or provide the ground truth data showing three subsystems with specific dependency counts (ipam-tools: 6, compute-tools: 4, app-common: 1)." + }, + "composite": 33, + "notFound": true + }, + { + "id": "cross-sub-002", + "category": "cross-subsystem", + "difficulty": "easy", + "question": "Which subsystems are identified as cross-cutting concerns?", + "groundTruth": "app-tools", + "llmAnswer": "root", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 1, + "notes": "AI answered 'root' instead of the correct answer 'app-tools'. The response is factually incorrect and fails to identify the actual cross-cutting concern subsystem specified in the ground truth." + }, + "composite": 7, + "notFound": false + }, + { + "id": "arch-001", + "category": "architectural", + "difficulty": "hard", + "question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core. Why might this be the case, and what do they actually contain?", + "groundTruth": "These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.", + "llmAnswer": "NOT_FOUND\n\nThe documentation provided does not contain subsystem reference pages for account-common, network-common, or network-core. There is only a single reference to account-common in the helm charts section, which shows it contains a Helm template for account creation, but this does not explain why these subsystems register 0 functions and 0 modules, nor does it describe their complete contents.", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI responded with 'NOT_FOUND' rather than providing the ground truth explanation. The correct answer requires understanding that Helm templates with Go template syntax ({{ }}) don't produce detectable functions/modules in traditional code analysis pipelines, and that these subsystems' content is captured through Helm chart extraction instead. AI correctly avoided hallucination but failed to locate or infer the actual explanation." + }, + "composite": 33, + "notFound": true + }, + { + "id": "arch-002", + "category": "architectural", + "difficulty": "easy", + "question": "What is the current version and appVersion of the mdm-app Helm chart?", + "groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop", + "llmAnswer": "Version: 0.7.4\nApp Version: 2025.12-12-develop", + "score": { + "accuracy": 5, + "completeness": 5, + "precision": 5, + "notes": "AI answer contains both required values with exact matches to ground truth. No hallucination or missing information." + }, + "composite": 100, + "notFound": false + }, + { + "id": "scenario-001", + "category": "scenario", + "difficulty": "hard", + "question": "If you need to rotate the \"vault-secret\" shared secret, which Helm charts would be affected and need redeployment?", + "groundTruth": "ai-app, cleanse-app, pms-app, rdm-app, shared-app", + "llmAnswer": "\n\nshared-app\nai-app\ncleanse-app\npms-app\nargocd-apps", + "score": { + "accuracy": 3, + "completeness": 3, + "precision": 3, + "notes": "AI answer includes 4 of 5 correct charts (shared-app, ai-app, cleanse-app, pms-app) but misses rdm-app and incorrectly includes argocd-apps, which is not affected by vault-secret rotation." + }, + "composite": 60, + "notFound": false + }, + { + "id": "scenario-002", + "category": "scenario", + "difficulty": "hard", + "question": "A new engineer needs to deploy the MDM application stack. What charts, configuration values, and external dependencies should they understand first?", + "groundTruth": "Chart: mdm-app (app-common/charts/mdm-app), Version: 0.7.4\nKey values: clusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain (+28 more)\nResources generated: 22 K8s resources\nInteractions: none detected", + "llmAnswer": "\n\nA new engineer deploying the MDM application stack should first understand:\n\n**Core Charts**\nThe deployment orchestrates multiple Helm charts: `mdm-app` (the primary application), `cluster` (cluster-level infrastructure and security agents), `argocd-apps` (GitOps deployment automation), `secret-operator` (External Secrets Operator for credential management), and supporting application charts (`shared-app`, `rdm-app`, `auth-app`, `cleanse-app`, `pms-app`, `ai-app`). Infrastructure charts like `milvus` (vector database), `opentelemetry-operator` (observability), and `external-dns` (DNS automation) provide foundational services.\n\n**Critical Configuration Values**\nCluster identity (`clusterName`, `clusterDomain`, `region`, `cloudProvider`), domain routing (`domain`, `rootDomain`, `etalonsubDomain`), vault integration (`vaultEnv`, vault secret stores), image registry access (`baseRegistryURL`, `imagePullSecrets`, `gcrregcred`), and feature flags (`deployIstio`, `deployLogdna`, `deploySentinelOne`, `deployPrisma`, `efsEnabled`). Storage configuration (`persistent` storage classes, `efsid` for EFS) and workload placement (`nodeSelector`, `tolerations`) determine resource allocation.\n\n**External Dependencies**\nHashiCorp Vault for secrets, External Secrets Operator for credential injection, Istio for service mesh and TLS, LogDNA for centralized logging, SentinelOne and Prisma for security agents, Milvus as a vector database backend, OpenTelemetry for distributed tracing and metrics, cert-manager for certificate lifecycle, ArgoCD for GitOps-driven deployments, and kube-state-metrics for Kubernetes cluster monitoring. Each dependency requires specific configuration values and external secret references to function.", + "score": { + "accuracy": 2, + "completeness": 2, + "precision": 2, + "notes": "AI identifies mdm-app and mentions several correct configuration values (clusterName, clusterDomain, region, cloudProvider, vaultEnv, etalonsubDomain) and dependencies (Vault, External Secrets Operator, Istio). However, it omits critical ground truth specifics: chart version 0.7.4, the 22 K8s resources generated, and the 'none detected' interactions note. The answer significantly expands scope with unverified charts (cluster, argocd-apps, secret-operator, milvus, opentelemetry-operator, external-dns) and configuration values not in ground truth, creating ambiguity about what's essential versus supplementary. Fails to clearly establish mdm-app as the primary chart requiring focus." + }, + "composite": 40, + "notFound": false + }, + { + "id": "scenario-003", + "category": "scenario", + "difficulty": "medium", + "question": "What network ports does the Cassandra chart expose, and what are they typically used for?", + "groundTruth": "Ports: 7000, 7001, 7199, 9042, 9160\n7000: Cassandra inter-node cluster communication\n7001: Cassandra SSL inter-node\n7199: JMX monitoring\n9042: CQL native transport (client connections)\n9160: Thrift client API (legacy)", + "llmAnswer": "NOT_FOUND\n\nThe documentation lists the Cassandra chart exposes five ports (7000, 7001, 7199, 9042, and 9160) but does not provide information about what these ports are typically used for.", + "score": { + "accuracy": 0, + "completeness": 0, + "precision": 5, + "notes": "AI correctly identified it could not find port usage information in documentation. No hallucination, but no answer provided to the question." + }, + "composite": 33, + "notFound": true + } + ] +} \ No newline at end of file diff --git a/ratchet-history.json b/ratchet-history.json new file mode 100644 index 0000000..6b73010 --- /dev/null +++ b/ratchet-history.json @@ -0,0 +1,14 @@ +{ + "timestamp": "2026-03-09T23:29:11.565Z", + "target": 80, + "history": [ + { + "iteration": 1, + "score": 66.9, + "questions": 33, + "notFoundRate": "18.2%" + } + ], + "finalScore": 66.9, + "passed": false +} \ No newline at end of file diff --git a/ratchet.js b/ratchet.js new file mode 100644 index 0000000..4f1f771 --- /dev/null +++ b/ratchet.js @@ -0,0 +1,396 @@ +/** + * Quality Ratchet: Automated Doc Improvement Loop + * + * BMad-inspired workflow: + * 1. EVAL: Run question bank against docs, get scores + * 2. DIAGNOSE: Identify weakest categories and specific failures + * 3. FIX: LLM proposes concrete code changes to the doc generator + * 4. APPLY: Apply fixes, regenerate docs + * 5. RE-EVAL: Score again + * 6. RATCHET: If passing threshold, add harder questions + * 7. REPEAT until target score or max iterations + * + * Usage: node ratchet.js [--target=80] [--max-iter=5] + */ + +const fs = require('fs'); +const path = require('path'); +const { execSync } = require('child_process'); +const { callLLM } = require('./prose.js'); +const { runEval } = require('./eval.js'); +const { generateQuestions } = require('./eval-questions.js'); + +const PROJECT_DIR = __dirname; + +function parseArgs() { + const args = process.argv.slice(2); + const opts = { + srcRoot: args[0], + snapshot: args[1], + docsDir: args[2], + target: 80, + maxIter: 5, + model: process.env.LLM_MODEL || 'claude-haiku-4.5', + fixModel: process.env.FIX_MODEL || 'claude-sonnet-4.6', + }; + for (const a of args) { + if (a.startsWith('--target=')) opts.target = Number(a.split('=')[1]); + if (a.startsWith('--max-iter=')) opts.maxIter = Number(a.split('=')[1]); + } + return opts; +} + +/** Diagnose failures and produce a fix plan */ +async function diagnose(report, opts) { + // Collect the worst-performing questions with full context + const failures = report.results + .filter(r => r.composite < 60) + .sort((a, b) => a.composite - b.composite) + .slice(0, 10); + + if (failures.length === 0) return { fixes: [], diagnosis: 'All questions above 60%. No critical failures.' }; + + // Read current sysdoc.js for context + const sysdocSource = fs.readFileSync(path.join(PROJECT_DIR, 'sysdoc.js'), 'utf8'); + + // Read extract-helm.js + const helmSource = fs.readFileSync(path.join(PROJECT_DIR, 'extract-helm.js'), 'utf8'); + + const failureDetails = failures.map(f => + `[${f.id}] Score: ${f.composite}% (A:${f.score.accuracy} C:${f.score.completeness} P:${f.score.precision}) + Q: ${f.question} + Expected: ${f.groundTruth.substring(0, 300)} + Got: ${f.llmAnswer.substring(0, 300)} + Judge notes: ${f.score.notes} + NOT_FOUND: ${f.notFound}` + ).join('\n\n'); + + const prompt = `You are a documentation pipeline engineer. Analyze these evaluation failures and propose CONCRETE fixes to the doc generator code. + +## Current Pipeline +The doc generator (sysdoc.js) produces Markdown documentation from: +1. Code analysis graph (subsystems, functions, modules) +2. Helm chart extraction (Chart.yaml, values.yaml, templates) +3. LLM prose generation + +## Failures (sorted worst-first) +${failureDetails} + +## Score Summary +Overall: ${report.overallScore}% +By category: ${JSON.stringify(report.byCategory)} +NOT_FOUND rate: ${report.notFoundRate} + +## Key Source Files +sysdoc.js generates the docs. extract-helm.js extracts Helm data. + +## Rules +1. Each fix must be a SPECIFIC change to sysdoc.js or extract-helm.js +2. Focus on information that IS extracted but NOT surfaced in the docs +3. If data is missing from extraction, propose extraction improvements +4. Prioritize fixes that improve multiple questions at once +5. Do NOT propose changes to the eval harness or questions + +Respond in this JSON format: +{ + "diagnosis": "2-3 sentence summary of root causes", + "fixes": [ + { + "id": "fix-001", + "file": "sysdoc.js", + "description": "What to change and why", + "impact": ["question-id-1", "question-id-2"], + "expectedImprovement": "+15%", + "code_hint": "Brief description of the code change needed" + } + ] +}`; + + const raw = await callLLM(prompt, { model: opts.fixModel, maxTokens: 2048, temperature: 0.2 }); + + try { + const jsonMatch = raw.match(/\{[\s\S]*\}/); + if (jsonMatch) return JSON.parse(jsonMatch[0]); + } catch {} + + return { diagnosis: raw.substring(0, 500), fixes: [] }; +} + +/** Apply a fix by having the LLM generate the actual code change */ +async function applyFix(fix, opts) { + const filePath = path.join(PROJECT_DIR, fix.file); + const source = fs.readFileSync(filePath, 'utf8'); + + const prompt = `You are modifying ${fix.file} to improve documentation quality. + +CHANGE NEEDED: ${fix.description} +CODE HINT: ${fix.code_hint} + +CURRENT SOURCE (${fix.file}): +${source} + +Generate ONLY the specific code change. Output in this format: +SEARCH: +\`\`\` +exact lines to find +\`\`\` +REPLACE: +\`\`\` +replacement lines +\`\`\` + +If multiple changes are needed, repeat the SEARCH/REPLACE blocks. +Be precise — the SEARCH text must match the source exactly.`; + + const raw = await callLLM(prompt, { model: opts.fixModel, maxTokens: 4096, temperature: 0.1 }); + + // Parse SEARCH/REPLACE blocks + const changes = []; + const blockRegex = /SEARCH:\s*```[^\n]*\n([\s\S]*?)```\s*REPLACE:\s*```[^\n]*\n([\s\S]*?)```/g; + let match; + while ((match = blockRegex.exec(raw)) !== null) { + changes.push({ search: match[1].trimEnd(), replace: match[2].trimEnd() }); + } + + if (changes.length === 0) { + console.log(` ⚠ No parseable changes from LLM for fix ${fix.id}`); + return false; + } + + // Apply changes + let modified = source; + let applied = 0; + for (const change of changes) { + if (modified.includes(change.search)) { + modified = modified.replace(change.search, change.replace); + applied++; + } else { + console.log(` ⚠ SEARCH block not found in ${fix.file} for fix ${fix.id}`); + } + } + + if (applied > 0) { + fs.writeFileSync(filePath, modified); + console.log(` ✓ Applied ${applied}/${changes.length} changes to ${fix.file}`); + return true; + } + return false; +} + +/** Regenerate docs */ +function regenerateDocs(opts) { + console.log(' Regenerating docs...'); + execSync(`rm -rf ${opts.docsDir}`, { cwd: PROJECT_DIR }); + const cmd = `LLM_MODEL=${opts.model} node sysdoc.js ${opts.snapshot} ${opts.srcRoot} ${opts.docsDir} --prose`; + execSync(cmd, { cwd: PROJECT_DIR, timeout: 600000, stdio: 'pipe' }); +} + +/** Generate harder questions based on current performance */ +function ratchetQuestions(currentQuestions, report, srcRoot, snapshotPath) { + // Find categories scoring > 90% — make them harder + const easyCategories = Object.entries(report.byCategory) + .filter(([, s]) => Number(s.avg) > 90) + .map(([cat]) => cat); + + if (easyCategories.length === 0) return currentQuestions; + + console.log(` Ratcheting: categories scoring >90%: ${easyCategories.join(', ')}`); + + // Add more specific questions for high-scoring categories + const newQuestions = [...currentQuestions]; + const { discoverCharts } = require('./extract-helm.js'); + const charts = discoverCharts(srcRoot, new Set(['node_modules', '.git', 'venv', '__pycache__', '.terraform', '_bmad', '_bmad-output', '.codex', '.claude'])); + + if (easyCategories.includes('configuration')) { + // Add nested value questions (harder than top-level) + for (const chart of charts.filter(c => c.values.keys.length > 20).slice(0, 3)) { + const objectKeys = chart.values.keys.filter(k => k.type === 'object'); + if (objectKeys.length > 0) { + newQuestions.push({ + id: `ratchet-config-${chart.chart.name}-nested`, + category: 'configuration', + difficulty: 'hard', + audience: ['human', 'machine'], + question: `In the ${chart.chart.name} chart, which configuration keys are complex objects (not simple values)? List them.`, + answer: objectKeys.map(k => k.name).join(', '), + answerType: 'list', + source: `${chart.dir}/values.yaml`, + }); + } + } + } + + if (easyCategories.includes('resource')) { + // Add cross-chart resource comparison questions + const deployers = charts.filter(c => c.templates.resources.some(r => r.kind === 'Deployment')); + if (deployers.length > 0) { + newQuestions.push({ + id: 'ratchet-resource-deployments', + category: 'resource', + difficulty: 'hard', + audience: ['human', 'machine'], + question: 'Which Helm charts create Kubernetes Deployments? List all of them.', + answer: deployers.map(c => `${c.chart.name} (${c.dir})`).join('\n'), + answerType: 'list', + source: 'template scanning', + }); + } + } + + if (easyCategories.includes('dependency')) { + // Add transitive dependency questions + const withDeps = charts.filter(c => c.chart.dependencies.length > 2); + for (const chart of withDeps.slice(0, 2)) { + newQuestions.push({ + id: `ratchet-dep-${chart.chart.name}-conditions`, + category: 'dependency', + difficulty: 'hard', + audience: ['machine'], + question: `What are the enable conditions for each dependency of the "${chart.chart.name}" chart?`, + answer: chart.chart.dependencies.map(d => `${d.name}: ${d.condition || 'always enabled'}`).join('\n'), + answerType: 'list', + source: `${chart.dir}/Chart.yaml`, + }); + } + } + + const added = newQuestions.length - currentQuestions.length; + if (added > 0) console.log(` Added ${added} harder questions`); + return newQuestions; +} + +/** Main ratchet loop */ +async function ratchetLoop(opts) { + console.log('═'.repeat(60)); + console.log('QUALITY RATCHET — BMad Improvement Loop'); + console.log('═'.repeat(60)); + console.log(`Target: ${opts.target}% Max iterations: ${opts.maxIter}`); + console.log(`Eval model: ${opts.model} Fix model: ${opts.fixModel}`); + console.log(); + + // Load initial questions + const questionsPath = path.join(PROJECT_DIR, 'eval-questions.json'); + let questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8')); + let questions = questionsData.questions; + + const history = []; + + for (let iter = 1; iter <= opts.maxIter; iter++) { + console.log(`\n${'─'.repeat(60)}`); + console.log(`ITERATION ${iter}/${opts.maxIter}`); + console.log('─'.repeat(60)); + + // Write current questions + const iterQuestionsPath = path.join(PROJECT_DIR, `eval-questions-iter${iter}.json`); + fs.writeFileSync(iterQuestionsPath, JSON.stringify({ generated: new Date().toISOString(), count: questions.length, questions }, null, 2)); + + // Step 1: Eval + console.log('\n📊 EVAL'); + const report = await runEval(opts.docsDir, iterQuestionsPath, { model: opts.model }); + + console.log(` Score: ${report.overallScore}% (A:${report.avgAccuracy} C:${report.avgCompleteness} P:${report.avgPrecision})`); + console.log(` NOT_FOUND: ${report.notFoundRate}`); + + history.push({ + iteration: iter, + score: report.overallScore, + questions: questions.length, + notFoundRate: report.notFoundRate, + }); + + // Check if we've hit the target + if (report.overallScore >= opts.target) { + console.log(`\n🎯 TARGET REACHED: ${report.overallScore}% >= ${opts.target}%`); + + // Ratchet: make it harder + const harderQuestions = ratchetQuestions(questions, report, opts.srcRoot, opts.snapshot); + if (harderQuestions.length > questions.length) { + console.log(` Ratcheting up: ${questions.length} → ${harderQuestions.length} questions`); + questions = harderQuestions; + // Don't break — run another iteration with harder questions + continue; + } else { + console.log(' No harder questions to add. Pipeline is solid.'); + break; + } + } + + // Step 2: Diagnose + console.log('\n🔍 DIAGNOSE'); + const diagnosis = await diagnose(report, opts); + console.log(` ${diagnosis.diagnosis}`); + console.log(` Proposed fixes: ${diagnosis.fixes.length}`); + + if (diagnosis.fixes.length === 0) { + console.log(' No actionable fixes proposed. Stopping.'); + break; + } + + // Step 3: Apply fixes + console.log('\n🔧 FIX'); + let anyApplied = false; + for (const fix of diagnosis.fixes.slice(0, 3)) { // Max 3 fixes per iteration + console.log(` [${fix.id}] ${fix.description}`); + console.log(` Impact: ${fix.impact.join(', ')} | Expected: ${fix.expectedImprovement}`); + const applied = await applyFix(fix, opts); + if (applied) anyApplied = true; + } + + if (!anyApplied) { + console.log(' No fixes could be applied. Stopping.'); + break; + } + + // Step 4: Regenerate docs + console.log('\n📝 REGENERATE'); + try { + regenerateDocs(opts); + } catch (err) { + console.log(` ⚠ Doc generation failed: ${err.message}`); + console.log(' Reverting changes...'); + execSync(`git checkout -- sysdoc.js extract-helm.js`, { cwd: PROJECT_DIR }); + break; + } + } + + // Final summary + console.log('\n' + '═'.repeat(60)); + console.log('RATCHET SUMMARY'); + console.log('═'.repeat(60)); + for (const h of history) { + console.log(` Iter ${h.iteration}: ${h.score}% (${h.questions} questions, NOT_FOUND: ${h.notFoundRate})`); + } + + const finalScore = history[history.length - 1]?.score || 0; + console.log(`\nFinal: ${finalScore}% ${finalScore >= opts.target ? '✅ PASS' : '❌ BELOW TARGET'}`); + + // Save history + const historyPath = path.join(PROJECT_DIR, 'ratchet-history.json'); + fs.writeFileSync(historyPath, JSON.stringify({ + timestamp: new Date().toISOString(), + target: opts.target, + history, + finalScore, + passed: finalScore >= opts.target, + }, null, 2)); + console.log(`History: ${historyPath}`); + + return { history, finalScore, passed: finalScore >= opts.target }; +} + +if (require.main === module) { + const opts = parseArgs(); + if (!opts.srcRoot || !opts.snapshot || !opts.docsDir) { + console.error('Usage: node ratchet.js [--target=80] [--max-iter=5]'); + process.exit(1); + } + + ratchetLoop(opts).then(result => { + process.exit(result.passed ? 0 : 1); + }).catch(err => { + console.error('Ratchet failed:', err); + process.exit(1); + }); +} + +module.exports = { ratchetLoop }; diff --git a/sysdoc.js b/sysdoc.js index cf5aa7a..65f1f30 100644 --- a/sysdoc.js +++ b/sysdoc.js @@ -103,11 +103,38 @@ async function generateDocs(graph, srcRoot, outDir, opts = {}) { const sysArchContent = `# System Architecture ${archProse} +## Summary Statistics +- **Subsystems:** ${subs.subsystems.length} +- **Helm Charts:** ${helmCharts.length} +- **Total Contracts:** ${contractsResult.contracts.length} +- **Cross-Cutting Concerns:** ${subs.crossCutting.join(', ') || 'none'} + ## Subsystems -${subs.subsystems.map(s => `- **${s.name}** (${s.kind}): ${s.entities.modules} modules, ${s.entities.functions} functions`).join('\n')} + +| Subsystem | Kind | Files | Modules | Functions | +|---|---|---|---|---| +${subs.subsystems.map(s => `| ${s.name} | ${s.kind} | ${s.files.length} | ${s.entities.modules} | ${s.entities.functions} |`).join('\n')} ## Cross-Cutting Concerns -${subs.crossCutting.map(c => `- **${c}**`).join('\n')} +${subs.crossCutting.map(c => `- **${c}**`).join('\n') || '*None detected*'} + +## Cross-Subsystem Dependencies + +| From | To | Calls | Imports | +|---|---|---|---| +${Object.entries(subs.dependencyMatrix).filter(([, v]) => (v.calls + v.imports) > 0).sort((a, b) => (b[1].calls + b[1].imports) - (a[1].calls + a[1].imports)).map(([k, v]) => { const [from, to] = k.split('→'); return `| ${from} | ${to} | ${v.calls} | ${v.imports} |`; }).join('\n')} + +## Top Helm Charts by Resource Count + +| Chart | Path | Resources | +|---|---|---| +${[...helmCharts].sort((a, b) => b.templates.resources.length - a.templates.resources.length).slice(0, 10).map(c => `| ${c.chart.name} | \`${c.dir}\` | ${c.templates.resources.length} |`).join('\n')} + +## Kubernetes Resource Types (across all charts) + +| Kind | Count | +|---|---| +${(() => { const kinds = {}; for (const c of helmCharts) for (const r of c.templates.resources) kinds[r.kind] = (kinds[r.kind] || 0) + 1; return Object.entries(kinds).sort((a, b) => b[1] - a[1]).slice(0, 15).map(([k, v]) => `| ${k} | ${v} |`).join('\n'); })()} ## Dependency Map \`\`\`mermaid