@@ -45,10 +45,18 @@ var (
4545 stableMetrics = []string {"controller_runtime" , "aws_sdk_go" , "client_go" , "leader_election" , "interruption" , "cluster_state" , "workqueue" , "karpenter_build_info" , "karpenter_nodepool_usage" , "karpenter_nodepool_limit" ,
4646 "karpenter_nodeclaims_terminated_total" , "karpenter_nodeclaims_created_total" , "karpenter_nodes_terminated_total" , "karpenter_nodes_created_total" , "karpenter_pods_startup_duration_seconds" ,
4747 "karpenter_scheduler_scheduling_duration_seconds" , "karpenter_provisioner_scheduling_duration_seconds" , "karpenter_nodepool_allowed_disruptions" , "karpenter_voluntary_disruption_decisions_total" }
48- betaMetrics = []string {"status_condition" , " cloudprovider" , "cloudprovider_batcher" , "karpenter_nodeclaims_termination_duration_seconds" , "karpenter_nodeclaims_instance_termination_duration_seconds" ,
48+ betaMetrics = []string {"cloudprovider" , "cloudprovider_batcher" , "karpenter_nodeclaims_termination_duration_seconds" , "karpenter_nodeclaims_instance_termination_duration_seconds" ,
4949 "karpenter_nodes_total_pod_requests" , "karpenter_nodes_total_pod_limits" , "karpenter_nodes_total_daemon_requests" , "karpenter_nodes_total_daemon_limits" , "karpenter_nodes_termination_duration_seconds" ,
5050 "karpenter_nodes_system_overhead" , "karpenter_nodes_allocatable" , "karpenter_pods_state" , "karpenter_scheduler_queue_depth" , "karpenter_voluntary_disruption_queue_failures_total" ,
51- "karpenter_voluntary_disruption_decision_evaluation_duration_seconds" , "karpenter_voluntary_disruption_eligible_nodes" , "karpenter_voluntary_disruption_consolidation_timeouts_total" }
51+ "karpenter_voluntary_disruption_decision_evaluation_duration_seconds" , "karpenter_voluntary_disruption_eligible_nodes" , "karpenter_voluntary_disruption_consolidation_timeouts_total" ,
52+ // Per-object status condition and termination metrics from operatorpkg
53+ "nodeclaim_status_condition" , "nodeclaim_termination" ,
54+ "node_status_condition" , "node_termination" ,
55+ "nodepool_status_condition" , "nodepool_termination" ,
56+ "ec2nodeclass_status_condition" , "ec2nodeclass_termination" }
57+ // Deprecated generic status condition and termination metrics (without object name prefix).
58+ // These are still emitted at runtime but are superseded by per-object variants.
59+ deprecatedMetrics = []string {"status_condition" , "termination" }
5260)
5361
5462func (i metricInfo ) qualifiedName () string {
@@ -69,6 +77,12 @@ func main() {
6977 allMetrics = append (allMetrics , getMetricsFromPackages (packages ... )... )
7078 }
7179
80+ // The operatorpkg status and events controllers dynamically create per-object metrics
81+ // at runtime based on the Go type parameter passed to status.NewController[T]().
82+ // These cannot be extracted via AST parsing, so we generate them from a known list of
83+ // object types that have status controllers registered.
84+ allMetrics = append (allMetrics , perObjectStatusMetrics ()... )
85+
7286 // Dedupe metrics
7387 allMetrics = lo .UniqBy (allMetrics , func (m metricInfo ) string {
7488 return fmt .Sprintf ("%s/%s/%s" , m .namespace , m .subsystem , m .name )
@@ -93,6 +107,15 @@ func main() {
93107 }
94108 sort .Slice (allMetrics , bySubsystem (allMetrics ))
95109
110+ // Sanity check: fail loudly if the metric count drops below expected.
111+ // This catches silent regressions where new identifier mappings are needed
112+ // or metric declaration patterns change. Update this threshold when metrics
113+ // are intentionally removed.
114+ const minExpectedMetrics = 100
115+ if len (allMetrics ) < minExpectedMetrics {
116+ log .Fatalf ("expected at least %d metrics but only found %d; the generator may be silently dropping metrics due to unrecognized identifiers or new declaration patterns" , minExpectedMetrics , len (allMetrics ))
117+ }
118+
96119 outputFileName := flag .Arg (flag .NArg () - 1 )
97120 f , err := os .Create (outputFileName )
98121 if err != nil {
@@ -132,6 +155,8 @@ description: >
132155 fmt .Fprintf (f , "### `%s`\n " , metric .qualifiedName ())
133156 fmt .Fprintf (f , "%s\n " , metric .help )
134157 switch {
158+ case slices .Contains (deprecatedMetrics , metric .subsystem ) || slices .Contains (deprecatedMetrics , metric .qualifiedName ()):
159+ fmt .Fprintf (f , "- Stability Level: %s\n " , "DEPRECATED" )
135160 case slices .Contains (stableMetrics , metric .subsystem ) || slices .Contains (stableMetrics , metric .qualifiedName ()):
136161 fmt .Fprintf (f , "- Stability Level: %s\n " , "STABLE" )
137162 case slices .Contains (betaMetrics , metric .subsystem ) || slices .Contains (betaMetrics , metric .qualifiedName ()):
@@ -159,7 +184,7 @@ func getPackages(root string) []*ast.Package {
159184 }
160185 // parse the packages that we find
161186 pkgs , err := parser .ParseDir (fset , path , func (info fs.FileInfo ) bool {
162- return true
187+ return ! strings . HasSuffix ( info . Name (), "_test.go" )
163188 }, parser .AllErrors )
164189 if err != nil {
165190 log .Fatalf ("error parsing, %s" , err )
@@ -176,22 +201,19 @@ func getPackages(root string) []*ast.Package {
176201}
177202
178203func getMetricsFromPackages (packages ... * ast.Package ) []metricInfo {
179- // metrics are all package global variables
180204 var allMetrics []metricInfo
181205 for _ , pkg := range packages {
182206 for _ , file := range pkg .Files {
183- for _ , decl := range file .Decls {
184- switch v := decl .(type ) {
185- case * ast.FuncDecl :
186- // ignore
187- case * ast.GenDecl :
188- if v .Tok == token .VAR {
189- allMetrics = append (allMetrics , handleVariableDeclaration (v )... )
190- }
191- default :
192-
207+ ast .Inspect (file , func (n ast.Node ) bool {
208+ ce , ok := n .(* ast.CallExpr )
209+ if ! ok {
210+ return true
193211 }
194- }
212+ if m , ok := metricFromCallExpr (ce ); ok {
213+ allMetrics = append (allMetrics , m )
214+ }
215+ return true
216+ })
195217 }
196218 }
197219 return allMetrics
@@ -201,16 +223,25 @@ func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
201223 // Higher ordering comes first. If a value isn't designated here then the subsystem will be given a default of 0.
202224 // Metrics without a subsystem come first since there is no designation for the bucket they fall under
203225 subSystemSortOrder := map [string ]int {
204- "" : 100 ,
205- "nodepool" : 10 ,
206- "nodeclaims" : 9 ,
207- "nodes" : 8 ,
208- "pods" : 7 ,
209- "status_condition" : - 1 ,
210- "workqueue" : - 1 ,
211- "client_go" : - 1 ,
212- "aws_sdk_go" : - 1 ,
213- "leader_election" : - 2 ,
226+ "" : 100 ,
227+ "nodepool" : 10 ,
228+ "nodeclaims" : 9 ,
229+ "nodeclaim_status_condition" : 8 ,
230+ "nodeclaim_termination" : 8 ,
231+ "nodes" : 7 ,
232+ "node_status_condition" : 6 ,
233+ "node_termination" : 6 ,
234+ "pods" : 5 ,
235+ "nodepool_status_condition" : 4 ,
236+ "nodepool_termination" : 4 ,
237+ "ec2nodeclass_status_condition" : 3 ,
238+ "ec2nodeclass_termination" : 3 ,
239+ "status_condition" : - 1 ,
240+ "termination" : - 1 ,
241+ "workqueue" : - 1 ,
242+ "client_go" : - 1 ,
243+ "aws_sdk_go" : - 1 ,
244+ "leader_election" : - 2 ,
214245 }
215246
216247 return func (i , j int ) bool {
@@ -223,71 +254,135 @@ func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
223254 }
224255}
225256
226- func handleVariableDeclaration (v * ast.GenDecl ) []metricInfo {
227- var promMetrics []metricInfo
228- for _ , spec := range v .Specs {
229- vs , ok := spec .(* ast.ValueSpec )
257+ // perObjectStatusMetrics generates metrics for the operatorpkg status and events controllers.
258+ // These metrics are dynamically created at runtime based on the Go type parameter passed to
259+ // status.NewController[T]() and cannot be extracted via AST parsing. The object types are
260+ // determined by the status controller registrations in karpenter and karpenter-provider-aws.
261+ func perObjectStatusMetrics () []metricInfo {
262+ // Object types that have status controllers registered via status.NewController[T]()
263+ // in karpenter (nodeclaim, nodepool, node) and karpenter-provider-aws (ec2nodeclass).
264+ objectNames := []string {"nodeclaim" , "nodepool" , "node" , "ec2nodeclass" }
265+
266+ type metricTemplate struct {
267+ subsystemSuffix string
268+ name string
269+ help string
270+ }
271+
272+ templates := []metricTemplate {
273+ {"status_condition" , "transition_seconds" , "The amount of time a condition was in a given state before transitioning. e.g. Alarm := P99(Updated=False) > 5 minutes" },
274+ {"status_condition" , "count" , "The number of a condition for a given object, type and status. e.g. Alarm := Available=False > 0" },
275+ {"status_condition" , "current_status_seconds" , "The current amount of time in seconds that a status condition has been in a specific state. Alarm := P99(Updated=Unknown) > 5 minutes" },
276+ {"status_condition" , "transitions_total" , "The count of transitions of a given object, type and status." },
277+ {"termination" , "current_time_seconds" , "The current amount of time in seconds that an object has been in terminating state." },
278+ {"termination" , "duration_seconds" , "The amount of time taken by an object to terminate completely." },
279+ }
280+
281+ var metricsOut []metricInfo
282+ for _ , obj := range objectNames {
283+ for _ , t := range templates {
284+ metricsOut = append (metricsOut , metricInfo {
285+ namespace : "operator" ,
286+ subsystem : fmt .Sprintf ("%s_%s" , obj , t .subsystemSuffix ),
287+ name : t .name ,
288+ help : t .help ,
289+ })
290+ }
291+ }
292+
293+ // Deprecated generic metrics (without object name prefix) are still emitted at runtime
294+ // when emitDeprecatedMetrics is enabled on the status controller. These use group/kind
295+ // labels instead of baking the object name into the subsystem.
296+ for _ , t := range templates {
297+ metricsOut = append (metricsOut , metricInfo {
298+ namespace : "operator" ,
299+ subsystem : t .subsystemSuffix ,
300+ name : t .name ,
301+ help : t .help ,
302+ })
303+ }
304+
305+ // client_go metrics are registered inside operatorpkg's RegisterClientMetrics() function
306+ // using unqualified NewPrometheus* calls (same package). These can't be parsed via AST
307+ // since we only recognize qualified opmetrics.* and prometheus.* calls.
308+ metricsOut = append (metricsOut ,
309+ metricInfo {name : "client_go_request_duration_seconds" , help : "Request latency in seconds. Broken down by verb, group, version, kind, and subresource." },
310+ metricInfo {name : "client_go_request_total" , help : "Number of HTTP requests, partitioned by status code and method." },
311+ )
312+
313+ return metricsOut
314+ }
315+
316+ // metricFromCallExpr attempts to extract metric info from a call expression.
317+ // It recognizes prometheus.New*(), opmetrics.NewPrometheus*(), and pmetrics.NewPrometheus*() calls.
318+ func metricFromCallExpr (ce * ast.CallExpr ) (metricInfo , bool ) {
319+ funcPkg := getFuncPackage (ce .Fun )
320+ // Determine the index of the opts argument based on the package.
321+ // prometheus.New*() calls pass opts as Args[0], while
322+ // opmetrics.NewPrometheus*() calls from operatorpkg pass
323+ // (registry, opts, labelNames), so opts is Args[1].
324+ var optsIdx int
325+ switch funcPkg {
326+ case "prometheus" :
327+ optsIdx = 0
328+ case "opmetrics" :
329+ optsIdx = 1
330+ default :
331+ return metricInfo {}, false
332+ }
333+ if len (ce .Args ) <= optsIdx {
334+ return metricInfo {}, false
335+ }
336+ arg , ok := ce .Args [optsIdx ].(* ast.CompositeLit )
337+ if ! ok {
338+ return metricInfo {}, false
339+ }
340+ keyValuePairs := map [string ]string {}
341+ for _ , el := range arg .Elts {
342+ kv , ok := el .(* ast.KeyValueExpr )
230343 if ! ok {
231344 continue
232345 }
233- for _ , v := range vs .Values {
234- ce , ok := v .(* ast.CallExpr )
235- if ! ok {
236- continue
237- }
238- funcPkg := getFuncPackage (ce .Fun )
239- if funcPkg != "prometheus" {
240- continue
241- }
242- if len (ce .Args ) == 0 {
243- continue
346+ key := fmt .Sprintf ("%s" , kv .Key )
347+ switch key {
348+ case "Namespace" , "Subsystem" , "Name" , "Help" :
349+ default :
350+ // skip any keys we don't care about
351+ continue
352+ }
353+ value := ""
354+ switch val := kv .Value .(type ) {
355+ case * ast.BasicLit :
356+ value = val .Value
357+ case * ast.SelectorExpr :
358+ selector := fmt .Sprintf ("%s.%s" , val .X , val .Sel )
359+ v , err := getIdentMapping (selector )
360+ if err != nil {
361+ log .Fatalf ("unresolvable selector %s for key %s: %s" , selector , key , err )
244362 }
245- arg := ce .Args [0 ].(* ast.CompositeLit )
246- keyValuePairs := map [string ]string {}
247- for _ , el := range arg .Elts {
248- kv := el .(* ast.KeyValueExpr )
249- key := fmt .Sprintf ("%s" , kv .Key )
250- switch key {
251- case "Namespace" , "Subsystem" , "Name" , "Help" :
252- default :
253- // skip any keys we don't care about
254- continue
255- }
256- value := ""
257- switch val := kv .Value .(type ) {
258- case * ast.BasicLit :
259- value = val .Value
260- case * ast.SelectorExpr :
261- selector := fmt .Sprintf ("%s.%s" , val .X , val .Sel )
262- if v , err := getIdentMapping (selector ); err != nil {
263- log .Fatalf ("unsupported selector %s, %s" , selector , err )
264- } else {
265- value = v
266- }
267- case * ast.Ident :
268- if v , err := getIdentMapping (val .String ()); err != nil {
269- log .Fatal (err )
270- } else {
271- value = v
272- }
273- case * ast.BinaryExpr :
274- value = getBinaryExpr (val )
275- default :
276- log .Fatalf ("unsupported value %T %v" , kv .Value , kv .Value )
277- }
278- keyValuePairs [key ] = strings .TrimFunc (value , func (r rune ) bool {
279- return r == '"'
280- })
363+ value = v
364+ case * ast.Ident :
365+ v , err := getIdentMapping (val .String ())
366+ if err != nil {
367+ log .Fatalf ("unresolvable identifier %q for key %s: %s" , val .String (), key , err )
281368 }
282- promMetrics = append ( promMetrics , metricInfo {
283- namespace : keyValuePairs [ "Namespace" ],
284- subsystem : keyValuePairs [ "Subsystem" ],
285- name : keyValuePairs [ "Name" ],
286- help : keyValuePairs [ "Help" ],
287- })
369+ value = v
370+ case * ast. BinaryExpr :
371+ value = getBinaryExpr ( val )
372+ default :
373+ // Unknown value expression type; skip this metric.
374+ return metricInfo {}, false
288375 }
376+ keyValuePairs [key ] = strings .TrimFunc (value , func (r rune ) bool {
377+ return r == '"'
378+ })
289379 }
290- return promMetrics
380+ return metricInfo {
381+ namespace : keyValuePairs ["Namespace" ],
382+ subsystem : keyValuePairs ["Subsystem" ],
383+ name : keyValuePairs ["Name" ],
384+ help : keyValuePairs ["Help" ],
385+ }, true
291386}
292387
293388func getFuncPackage (fun ast.Expr ) string {
@@ -309,7 +404,6 @@ func getFuncPackage(fun ast.Expr) string {
309404 if _ , ok := fun .(* ast.FuncLit ); ok {
310405 return ""
311406 }
312- log .Fatalf ("unsupported func expression %T, %v" , fun , fun )
313407 return ""
314408}
315409
@@ -340,6 +434,7 @@ func getIdentMapping(identName string) (string, error) {
340434 "metrics.Namespace" : metrics .Namespace ,
341435 "Namespace" : metrics .Namespace ,
342436
437+ "pmetrics.Namespace" : "operator" ,
343438 "MetricNamespace" : "operator" ,
344439 "MetricSubsystem" : "status_condition" ,
345440 "TerminationSubsystem" : "termination" ,
0 commit comments