Skip to content

Commit 37343ac

Browse files
committed
docs: fix metrics doc generator to support opmetrics/pmetrics constructors and regenerate all versioned metrics docs
1 parent e8a4d26 commit 37343ac

5 files changed

Lines changed: 1663 additions & 179 deletions

File tree

hack/docs/metrics_gen/main.go

Lines changed: 179 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,18 @@ var (
4545
stableMetrics = []string{"controller_runtime", "aws_sdk_go", "client_go", "leader_election", "interruption", "cluster_state", "workqueue", "karpenter_build_info", "karpenter_nodepool_usage", "karpenter_nodepool_limit",
4646
"karpenter_nodeclaims_terminated_total", "karpenter_nodeclaims_created_total", "karpenter_nodes_terminated_total", "karpenter_nodes_created_total", "karpenter_pods_startup_duration_seconds",
4747
"karpenter_scheduler_scheduling_duration_seconds", "karpenter_provisioner_scheduling_duration_seconds", "karpenter_nodepool_allowed_disruptions", "karpenter_voluntary_disruption_decisions_total"}
48-
betaMetrics = []string{"status_condition", "cloudprovider", "cloudprovider_batcher", "karpenter_nodeclaims_termination_duration_seconds", "karpenter_nodeclaims_instance_termination_duration_seconds",
48+
betaMetrics = []string{"cloudprovider", "cloudprovider_batcher", "karpenter_nodeclaims_termination_duration_seconds", "karpenter_nodeclaims_instance_termination_duration_seconds",
4949
"karpenter_nodes_total_pod_requests", "karpenter_nodes_total_pod_limits", "karpenter_nodes_total_daemon_requests", "karpenter_nodes_total_daemon_limits", "karpenter_nodes_termination_duration_seconds",
5050
"karpenter_nodes_system_overhead", "karpenter_nodes_allocatable", "karpenter_pods_state", "karpenter_scheduler_queue_depth", "karpenter_voluntary_disruption_queue_failures_total",
51-
"karpenter_voluntary_disruption_decision_evaluation_duration_seconds", "karpenter_voluntary_disruption_eligible_nodes", "karpenter_voluntary_disruption_consolidation_timeouts_total"}
51+
"karpenter_voluntary_disruption_decision_evaluation_duration_seconds", "karpenter_voluntary_disruption_eligible_nodes", "karpenter_voluntary_disruption_consolidation_timeouts_total",
52+
// Per-object status condition and termination metrics from operatorpkg
53+
"nodeclaim_status_condition", "nodeclaim_termination",
54+
"node_status_condition", "node_termination",
55+
"nodepool_status_condition", "nodepool_termination",
56+
"ec2nodeclass_status_condition", "ec2nodeclass_termination"}
57+
// Deprecated generic status condition and termination metrics (without object name prefix).
58+
// These are still emitted at runtime but are superseded by per-object variants.
59+
deprecatedMetrics = []string{"status_condition", "termination"}
5260
)
5361

5462
func (i metricInfo) qualifiedName() string {
@@ -69,6 +77,12 @@ func main() {
6977
allMetrics = append(allMetrics, getMetricsFromPackages(packages...)...)
7078
}
7179

80+
// The operatorpkg status and events controllers dynamically create per-object metrics
81+
// at runtime based on the Go type parameter passed to status.NewController[T]().
82+
// These cannot be extracted via AST parsing, so we generate them from a known list of
83+
// object types that have status controllers registered.
84+
allMetrics = append(allMetrics, perObjectStatusMetrics()...)
85+
7286
// Dedupe metrics
7387
allMetrics = lo.UniqBy(allMetrics, func(m metricInfo) string {
7488
return fmt.Sprintf("%s/%s/%s", m.namespace, m.subsystem, m.name)
@@ -93,6 +107,15 @@ func main() {
93107
}
94108
sort.Slice(allMetrics, bySubsystem(allMetrics))
95109

110+
// Sanity check: fail loudly if the metric count drops below expected.
111+
// This catches silent regressions where new identifier mappings are needed
112+
// or metric declaration patterns change. Update this threshold when metrics
113+
// are intentionally removed.
114+
const minExpectedMetrics = 100
115+
if len(allMetrics) < minExpectedMetrics {
116+
log.Fatalf("expected at least %d metrics but only found %d; the generator may be silently dropping metrics due to unrecognized identifiers or new declaration patterns", minExpectedMetrics, len(allMetrics))
117+
}
118+
96119
outputFileName := flag.Arg(flag.NArg() - 1)
97120
f, err := os.Create(outputFileName)
98121
if err != nil {
@@ -132,6 +155,8 @@ description: >
132155
fmt.Fprintf(f, "### `%s`\n", metric.qualifiedName())
133156
fmt.Fprintf(f, "%s\n", metric.help)
134157
switch {
158+
case slices.Contains(deprecatedMetrics, metric.subsystem) || slices.Contains(deprecatedMetrics, metric.qualifiedName()):
159+
fmt.Fprintf(f, "- Stability Level: %s\n", "DEPRECATED")
135160
case slices.Contains(stableMetrics, metric.subsystem) || slices.Contains(stableMetrics, metric.qualifiedName()):
136161
fmt.Fprintf(f, "- Stability Level: %s\n", "STABLE")
137162
case slices.Contains(betaMetrics, metric.subsystem) || slices.Contains(betaMetrics, metric.qualifiedName()):
@@ -159,7 +184,7 @@ func getPackages(root string) []*ast.Package {
159184
}
160185
// parse the packages that we find
161186
pkgs, err := parser.ParseDir(fset, path, func(info fs.FileInfo) bool {
162-
return true
187+
return !strings.HasSuffix(info.Name(), "_test.go")
163188
}, parser.AllErrors)
164189
if err != nil {
165190
log.Fatalf("error parsing, %s", err)
@@ -176,22 +201,19 @@ func getPackages(root string) []*ast.Package {
176201
}
177202

178203
func getMetricsFromPackages(packages ...*ast.Package) []metricInfo {
179-
// metrics are all package global variables
180204
var allMetrics []metricInfo
181205
for _, pkg := range packages {
182206
for _, file := range pkg.Files {
183-
for _, decl := range file.Decls {
184-
switch v := decl.(type) {
185-
case *ast.FuncDecl:
186-
// ignore
187-
case *ast.GenDecl:
188-
if v.Tok == token.VAR {
189-
allMetrics = append(allMetrics, handleVariableDeclaration(v)...)
190-
}
191-
default:
192-
207+
ast.Inspect(file, func(n ast.Node) bool {
208+
ce, ok := n.(*ast.CallExpr)
209+
if !ok {
210+
return true
193211
}
194-
}
212+
if m, ok := metricFromCallExpr(ce); ok {
213+
allMetrics = append(allMetrics, m)
214+
}
215+
return true
216+
})
195217
}
196218
}
197219
return allMetrics
@@ -201,16 +223,25 @@ func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
201223
// Higher ordering comes first. If a value isn't designated here then the subsystem will be given a default of 0.
202224
// Metrics without a subsystem come first since there is no designation for the bucket they fall under
203225
subSystemSortOrder := map[string]int{
204-
"": 100,
205-
"nodepool": 10,
206-
"nodeclaims": 9,
207-
"nodes": 8,
208-
"pods": 7,
209-
"status_condition": -1,
210-
"workqueue": -1,
211-
"client_go": -1,
212-
"aws_sdk_go": -1,
213-
"leader_election": -2,
226+
"": 100,
227+
"nodepool": 10,
228+
"nodeclaims": 9,
229+
"nodeclaim_status_condition": 8,
230+
"nodeclaim_termination": 8,
231+
"nodes": 7,
232+
"node_status_condition": 6,
233+
"node_termination": 6,
234+
"pods": 5,
235+
"nodepool_status_condition": 4,
236+
"nodepool_termination": 4,
237+
"ec2nodeclass_status_condition": 3,
238+
"ec2nodeclass_termination": 3,
239+
"status_condition": -1,
240+
"termination": -1,
241+
"workqueue": -1,
242+
"client_go": -1,
243+
"aws_sdk_go": -1,
244+
"leader_election": -2,
214245
}
215246

216247
return func(i, j int) bool {
@@ -223,71 +254,135 @@ func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
223254
}
224255
}
225256

226-
func handleVariableDeclaration(v *ast.GenDecl) []metricInfo {
227-
var promMetrics []metricInfo
228-
for _, spec := range v.Specs {
229-
vs, ok := spec.(*ast.ValueSpec)
257+
// perObjectStatusMetrics generates metrics for the operatorpkg status and events controllers.
258+
// These metrics are dynamically created at runtime based on the Go type parameter passed to
259+
// status.NewController[T]() and cannot be extracted via AST parsing. The object types are
260+
// determined by the status controller registrations in karpenter and karpenter-provider-aws.
261+
func perObjectStatusMetrics() []metricInfo {
262+
// Object types that have status controllers registered via status.NewController[T]()
263+
// in karpenter (nodeclaim, nodepool, node) and karpenter-provider-aws (ec2nodeclass).
264+
objectNames := []string{"nodeclaim", "nodepool", "node", "ec2nodeclass"}
265+
266+
type metricTemplate struct {
267+
subsystemSuffix string
268+
name string
269+
help string
270+
}
271+
272+
templates := []metricTemplate{
273+
{"status_condition", "transition_seconds", "The amount of time a condition was in a given state before transitioning. e.g. Alarm := P99(Updated=False) > 5 minutes"},
274+
{"status_condition", "count", "The number of a condition for a given object, type and status. e.g. Alarm := Available=False > 0"},
275+
{"status_condition", "current_status_seconds", "The current amount of time in seconds that a status condition has been in a specific state. Alarm := P99(Updated=Unknown) > 5 minutes"},
276+
{"status_condition", "transitions_total", "The count of transitions of a given object, type and status."},
277+
{"termination", "current_time_seconds", "The current amount of time in seconds that an object has been in terminating state."},
278+
{"termination", "duration_seconds", "The amount of time taken by an object to terminate completely."},
279+
}
280+
281+
var metricsOut []metricInfo
282+
for _, obj := range objectNames {
283+
for _, t := range templates {
284+
metricsOut = append(metricsOut, metricInfo{
285+
namespace: "operator",
286+
subsystem: fmt.Sprintf("%s_%s", obj, t.subsystemSuffix),
287+
name: t.name,
288+
help: t.help,
289+
})
290+
}
291+
}
292+
293+
// Deprecated generic metrics (without object name prefix) are still emitted at runtime
294+
// when emitDeprecatedMetrics is enabled on the status controller. These use group/kind
295+
// labels instead of baking the object name into the subsystem.
296+
for _, t := range templates {
297+
metricsOut = append(metricsOut, metricInfo{
298+
namespace: "operator",
299+
subsystem: t.subsystemSuffix,
300+
name: t.name,
301+
help: t.help,
302+
})
303+
}
304+
305+
// client_go metrics are registered inside operatorpkg's RegisterClientMetrics() function
306+
// using unqualified NewPrometheus* calls (same package). These can't be parsed via AST
307+
// since we only recognize qualified opmetrics.* and prometheus.* calls.
308+
metricsOut = append(metricsOut,
309+
metricInfo{name: "client_go_request_duration_seconds", help: "Request latency in seconds. Broken down by verb, group, version, kind, and subresource."},
310+
metricInfo{name: "client_go_request_total", help: "Number of HTTP requests, partitioned by status code and method."},
311+
)
312+
313+
return metricsOut
314+
}
315+
316+
// metricFromCallExpr attempts to extract metric info from a call expression.
317+
// It recognizes prometheus.New*(), opmetrics.NewPrometheus*(), and pmetrics.NewPrometheus*() calls.
318+
func metricFromCallExpr(ce *ast.CallExpr) (metricInfo, bool) {
319+
funcPkg := getFuncPackage(ce.Fun)
320+
// Determine the index of the opts argument based on the package.
321+
// prometheus.New*() calls pass opts as Args[0], while
322+
// opmetrics.NewPrometheus*() calls from operatorpkg pass
323+
// (registry, opts, labelNames), so opts is Args[1].
324+
var optsIdx int
325+
switch funcPkg {
326+
case "prometheus":
327+
optsIdx = 0
328+
case "opmetrics":
329+
optsIdx = 1
330+
default:
331+
return metricInfo{}, false
332+
}
333+
if len(ce.Args) <= optsIdx {
334+
return metricInfo{}, false
335+
}
336+
arg, ok := ce.Args[optsIdx].(*ast.CompositeLit)
337+
if !ok {
338+
return metricInfo{}, false
339+
}
340+
keyValuePairs := map[string]string{}
341+
for _, el := range arg.Elts {
342+
kv, ok := el.(*ast.KeyValueExpr)
230343
if !ok {
231344
continue
232345
}
233-
for _, v := range vs.Values {
234-
ce, ok := v.(*ast.CallExpr)
235-
if !ok {
236-
continue
237-
}
238-
funcPkg := getFuncPackage(ce.Fun)
239-
if funcPkg != "prometheus" {
240-
continue
241-
}
242-
if len(ce.Args) == 0 {
243-
continue
346+
key := fmt.Sprintf("%s", kv.Key)
347+
switch key {
348+
case "Namespace", "Subsystem", "Name", "Help":
349+
default:
350+
// skip any keys we don't care about
351+
continue
352+
}
353+
value := ""
354+
switch val := kv.Value.(type) {
355+
case *ast.BasicLit:
356+
value = val.Value
357+
case *ast.SelectorExpr:
358+
selector := fmt.Sprintf("%s.%s", val.X, val.Sel)
359+
v, err := getIdentMapping(selector)
360+
if err != nil {
361+
log.Fatalf("unresolvable selector %s for key %s: %s", selector, key, err)
244362
}
245-
arg := ce.Args[0].(*ast.CompositeLit)
246-
keyValuePairs := map[string]string{}
247-
for _, el := range arg.Elts {
248-
kv := el.(*ast.KeyValueExpr)
249-
key := fmt.Sprintf("%s", kv.Key)
250-
switch key {
251-
case "Namespace", "Subsystem", "Name", "Help":
252-
default:
253-
// skip any keys we don't care about
254-
continue
255-
}
256-
value := ""
257-
switch val := kv.Value.(type) {
258-
case *ast.BasicLit:
259-
value = val.Value
260-
case *ast.SelectorExpr:
261-
selector := fmt.Sprintf("%s.%s", val.X, val.Sel)
262-
if v, err := getIdentMapping(selector); err != nil {
263-
log.Fatalf("unsupported selector %s, %s", selector, err)
264-
} else {
265-
value = v
266-
}
267-
case *ast.Ident:
268-
if v, err := getIdentMapping(val.String()); err != nil {
269-
log.Fatal(err)
270-
} else {
271-
value = v
272-
}
273-
case *ast.BinaryExpr:
274-
value = getBinaryExpr(val)
275-
default:
276-
log.Fatalf("unsupported value %T %v", kv.Value, kv.Value)
277-
}
278-
keyValuePairs[key] = strings.TrimFunc(value, func(r rune) bool {
279-
return r == '"'
280-
})
363+
value = v
364+
case *ast.Ident:
365+
v, err := getIdentMapping(val.String())
366+
if err != nil {
367+
log.Fatalf("unresolvable identifier %q for key %s: %s", val.String(), key, err)
281368
}
282-
promMetrics = append(promMetrics, metricInfo{
283-
namespace: keyValuePairs["Namespace"],
284-
subsystem: keyValuePairs["Subsystem"],
285-
name: keyValuePairs["Name"],
286-
help: keyValuePairs["Help"],
287-
})
369+
value = v
370+
case *ast.BinaryExpr:
371+
value = getBinaryExpr(val)
372+
default:
373+
// Unknown value expression type; skip this metric.
374+
return metricInfo{}, false
288375
}
376+
keyValuePairs[key] = strings.TrimFunc(value, func(r rune) bool {
377+
return r == '"'
378+
})
289379
}
290-
return promMetrics
380+
return metricInfo{
381+
namespace: keyValuePairs["Namespace"],
382+
subsystem: keyValuePairs["Subsystem"],
383+
name: keyValuePairs["Name"],
384+
help: keyValuePairs["Help"],
385+
}, true
291386
}
292387

293388
func getFuncPackage(fun ast.Expr) string {
@@ -309,7 +404,6 @@ func getFuncPackage(fun ast.Expr) string {
309404
if _, ok := fun.(*ast.FuncLit); ok {
310405
return ""
311406
}
312-
log.Fatalf("unsupported func expression %T, %v", fun, fun)
313407
return ""
314408
}
315409

@@ -340,6 +434,7 @@ func getIdentMapping(identName string) (string, error) {
340434
"metrics.Namespace": metrics.Namespace,
341435
"Namespace": metrics.Namespace,
342436

437+
"pmetrics.Namespace": "operator",
343438
"MetricNamespace": "operator",
344439
"MetricSubsystem": "status_condition",
345440
"TerminationSubsystem": "termination",

0 commit comments

Comments
 (0)