Skip to content

Commit f55c94e

Browse files
authored
Add NGINX reload counters (#1049)
* Add NGINX reload counters
1 parent 4c3185d commit f55c94e

File tree

6 files changed

+184
-21
lines changed

6 files changed

+184
-21
lines changed

docs/monitoring.md

+9
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,15 @@ NGINX Kubernetes Gateway exports the following metrics:
8686
- These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the
8787
Gateway class of NKG. For example, `nginx_kubernetes_gateway_connections_accepted{class="nginx"}`.
8888

89+
- NGINX Kubernetes Gateway metrics:
90+
- nginx_reloads_total. Number of successful NGINX reloads.
91+
- nginx_reload_errors_total. Number of unsuccessful NGINX reloads.
92+
- nginx_stale_config. 1 means NKG failed to configure NGINX with the latest version of the configuration, which means
93+
NGINX is running with a stale version.
94+
- nginx_reloads_milliseconds. Duration in milliseconds of NGINX reloads (histogram).
95+
- These metrics have the namespace `nginx_kubernetes_gateway`, and include the label `class` which is set to the
96+
Gateway class of NKG. For example, `nginx_kubernetes_gateway_nginx_reloads_total{class="nginx"}`.
97+
8998
- [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) metrics. These include:
9099
- Total number of reconciliation errors per controller
91100
- Length of reconcile queue per controller

internal/mode/static/manager.go

+31-16
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,16 @@ func StartManager(cfg config.Config) error {
125125
return fmt.Errorf("cannot clear NGINX configuration folders: %w", err)
126126
}
127127

128+
// Ensure NGINX is running before registering metrics & starting the manager.
129+
if err := ngxruntime.EnsureNginxRunning(ctx); err != nil {
130+
return fmt.Errorf("NGINX is not running: %w", err)
131+
}
132+
133+
mgrCollector, err := createAndRegisterMetricsCollectors(cfg.MetricsConfig.Enabled, cfg.GatewayClassName)
134+
if err != nil {
135+
return fmt.Errorf("cannot create and register metrics collectors: %w", err)
136+
}
137+
128138
statusUpdater := status.NewUpdater(status.UpdaterConfig{
129139
GatewayCtlrName: cfg.GatewayCtlrName,
130140
GatewayClassName: cfg.GatewayClassName,
@@ -146,7 +156,7 @@ func StartManager(cfg config.Config) error {
146156
cfg.Logger.WithName("nginxFileManager"),
147157
file.NewStdLibOSFileManager(),
148158
),
149-
nginxRuntimeMgr: ngxruntime.NewManagerImpl(),
159+
nginxRuntimeMgr: ngxruntime.NewManagerImpl(mgrCollector),
150160
statusUpdater: statusUpdater,
151161
eventRecorder: recorder,
152162
healthChecker: hc,
@@ -193,17 +203,6 @@ func StartManager(cfg config.Config) error {
193203
}
194204
}
195205

196-
// Ensure NGINX is running before registering metrics & starting the manager.
197-
if err := ngxruntime.EnsureNginxRunning(ctx); err != nil {
198-
return fmt.Errorf("NGINX is not running: %w", err)
199-
}
200-
201-
if cfg.MetricsConfig.Enabled {
202-
if err := configureNginxMetrics(cfg.GatewayClassName); err != nil {
203-
return err
204-
}
205-
}
206-
207206
cfg.Logger.Info("Starting manager")
208207
return mgr.Start(ctx)
209208
}
@@ -353,13 +352,29 @@ func setInitialConfig(
353352
return updateControlPlane(&config, logger, eventRecorder, configName, logLevelSetter)
354353
}
355354

356-
func configureNginxMetrics(gatewayClassName string) error {
357-
constLabels := map[string]string{"class": gatewayClassName}
355+
// createAndRegisterMetricsCollectors creates the NGINX status and NGINX runtime manager collectors, registers them,
356+
// and returns the runtime manager collector to be used in the nginxRuntimeMgr.
357+
func createAndRegisterMetricsCollectors(metricsEnabled bool, gwClassName string) (ngxruntime.ManagerCollector, error) {
358+
if !metricsEnabled {
359+
// return a no-op collector to avoid nil pointer errors when metrics are disabled
360+
return nkgmetrics.NewManagerNoopCollector(), nil
361+
}
362+
constLabels := map[string]string{"class": gwClassName}
363+
358364
ngxCollector, err := nkgmetrics.NewNginxMetricsCollector(constLabels)
359365
if err != nil {
360-
return fmt.Errorf("cannot get NGINX metrics: %w", err)
366+
return nil, fmt.Errorf("cannot create NGINX status metrics collector: %w", err)
367+
}
368+
if err := metrics.Registry.Register(ngxCollector); err != nil {
369+
return nil, fmt.Errorf("failed to register NGINX status metrics collector: %w", err)
361370
}
362-
return metrics.Registry.Register(ngxCollector)
371+
372+
mgrCollector := nkgmetrics.NewManagerMetricsCollector(constLabels)
373+
if err := metrics.Registry.Register(mgrCollector); err != nil {
374+
return nil, fmt.Errorf("failed to register NGINX manager runtime metrics collector: %w", err)
375+
}
376+
377+
return mgrCollector, nil
363378
}
364379

365380
func getMetricsOptions(cfg config.MetricsConfig) metricsserver.Options {
+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package metrics
2+
3+
import (
4+
"time"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
)
8+
9+
// ManagerMetricsCollector implements ManagerCollector interface and prometheus.Collector interface
10+
type ManagerMetricsCollector struct {
11+
// Metrics
12+
reloadsTotal prometheus.Counter
13+
reloadsError prometheus.Counter
14+
configStale prometheus.Gauge
15+
reloadsDuration prometheus.Histogram
16+
}
17+
18+
// NewManagerMetricsCollector creates a new ManagerMetricsCollector
19+
func NewManagerMetricsCollector(constLabels map[string]string) *ManagerMetricsCollector {
20+
nc := &ManagerMetricsCollector{
21+
reloadsTotal: prometheus.NewCounter(
22+
prometheus.CounterOpts{
23+
Name: "nginx_reloads_total",
24+
Namespace: metricsNamespace,
25+
Help: "Number of successful NGINX reloads",
26+
ConstLabels: constLabels,
27+
}),
28+
reloadsError: prometheus.NewCounter(
29+
prometheus.CounterOpts{
30+
Name: "nginx_reload_errors_total",
31+
Namespace: metricsNamespace,
32+
Help: "Number of unsuccessful NGINX reloads",
33+
ConstLabels: constLabels,
34+
},
35+
),
36+
configStale: prometheus.NewGauge(
37+
prometheus.GaugeOpts{
38+
Name: "nginx_stale_config",
39+
Namespace: metricsNamespace,
40+
Help: "Indicates if NGINX is not serving the latest configuration.",
41+
ConstLabels: constLabels,
42+
},
43+
),
44+
reloadsDuration: prometheus.NewHistogram(
45+
prometheus.HistogramOpts{
46+
Name: "nginx_reloads_milliseconds",
47+
Namespace: metricsNamespace,
48+
Help: "Duration in milliseconds of NGINX reloads",
49+
ConstLabels: constLabels,
50+
Buckets: []float64{500, 1000, 5000, 10000, 30000},
51+
},
52+
),
53+
}
54+
return nc
55+
}
56+
57+
// IncReloadCount increments the counter of successful NGINX reloads and sets the stale config status to false.
58+
func (mc *ManagerMetricsCollector) IncReloadCount() {
59+
mc.reloadsTotal.Inc()
60+
mc.updateConfigStaleStatus(false)
61+
}
62+
63+
// IncReloadErrors increments the counter of NGINX reload errors and sets the stale config status to true.
64+
func (mc *ManagerMetricsCollector) IncReloadErrors() {
65+
mc.reloadsError.Inc()
66+
mc.updateConfigStaleStatus(true)
67+
}
68+
69+
// updateConfigStaleStatus sets the stale config gauge to 1 when stale is true and to 0 otherwise.
70+
func (mc *ManagerMetricsCollector) updateConfigStaleStatus(stale bool) {
71+
var status float64
72+
if stale {
73+
status = 1.0
74+
}
75+
mc.configStale.Set(status)
76+
}
77+
78+
// ObserveLastReloadTime adds the last NGINX reload time to the histogram.
79+
func (mc *ManagerMetricsCollector) ObserveLastReloadTime(duration time.Duration) {
80+
mc.reloadsDuration.Observe(float64(duration / time.Millisecond))
81+
}
82+
83+
// Describe implements prometheus.Collector interface Describe method.
84+
func (mc *ManagerMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
85+
mc.reloadsTotal.Describe(ch)
86+
mc.reloadsError.Describe(ch)
87+
mc.configStale.Describe(ch)
88+
mc.reloadsDuration.Describe(ch)
89+
}
90+
91+
// Collect implements the prometheus.Collector interface Collect method.
92+
func (mc *ManagerMetricsCollector) Collect(ch chan<- prometheus.Metric) {
93+
mc.reloadsTotal.Collect(ch)
94+
mc.reloadsError.Collect(ch)
95+
mc.configStale.Collect(ch)
96+
mc.reloadsDuration.Collect(ch)
97+
}
98+
99+
// ManagerNoopCollector is a no-op collector that will implement ManagerCollector interface.
100+
// Used to initialize the ManagerCollector when metrics are disabled to avoid nil pointer errors.
101+
type ManagerNoopCollector struct{}
102+
103+
// NewManagerNoopCollector creates a no-op collector that implements ManagerCollector interface.
104+
func NewManagerNoopCollector() *ManagerNoopCollector {
105+
return &ManagerNoopCollector{}
106+
}
107+
108+
// IncReloadCount implements a no-op IncReloadCount.
109+
func (mc *ManagerNoopCollector) IncReloadCount() {}
110+
111+
// IncReloadErrors implements a no-op IncReloadErrors.
112+
func (mc *ManagerNoopCollector) IncReloadErrors() {}
113+
114+
// ObserveLastReloadTime implements a no-op ObserveLastReloadTime.
115+
func (mc *ManagerNoopCollector) ObserveLastReloadTime(_ time.Duration) {}
+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
package metrics
2+
3+
// nolint:gosec // flagged as potential hardcoded credentials, but is not sensitive
4+
const metricsNamespace = "nginx_kubernetes_gateway"

internal/mode/static/metrics/nginx.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ func NewNginxMetricsCollector(constLabels map[string]string) (prometheus.Collect
2424
if err != nil {
2525
return nil, err
2626
}
27-
return nginxCollector.NewNginxCollector(client, "nginx_kubernetes_gateway", constLabels), nil
27+
return nginxCollector.NewNginxCollector(client, metricsNamespace, constLabels), nil
2828
}
2929

3030
// getSocketClient gets an http.Client with a unix socket transport.

internal/mode/static/nginx/runtime/manager.go

+24-4
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,29 @@ type Manager interface {
4141
Reload(ctx context.Context, configVersion int) error
4242
}
4343

44+
// ManagerCollector is an interface for the metrics of the NGINX runtime manager.
45+
type ManagerCollector interface {
46+
IncReloadCount()
47+
IncReloadErrors()
48+
ObserveLastReloadTime(ms time.Duration)
49+
}
50+
4451
// ManagerImpl implements Manager.
4552
type ManagerImpl struct {
46-
verifyClient *verifyClient
53+
verifyClient *verifyClient
54+
managerCollector ManagerCollector
4755
}
4856

4957
// NewManagerImpl creates a new ManagerImpl.
50-
func NewManagerImpl() *ManagerImpl {
58+
func NewManagerImpl(managerCollector ManagerCollector) *ManagerImpl {
5159
return &ManagerImpl{
52-
verifyClient: newVerifyClient(nginxReloadTimeout),
60+
verifyClient: newVerifyClient(nginxReloadTimeout),
61+
managerCollector: managerCollector,
5362
}
5463
}
5564

5665
func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
66+
start := time.Now()
5767
// We find the main NGINX PID on every reload because it will change if the NGINX container is restarted.
5868
pid, err := findMainProcess(ctx, os.Stat, os.ReadFile, pidFileTimeout)
5969
if err != nil {
@@ -69,6 +79,7 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
6979
// send the HUP signal to the NGINX main process to reload its configuration
7080
// See https://nginx.org/en/docs/control.html
7181
if err := syscall.Kill(pid, syscall.SIGHUP); err != nil {
82+
m.managerCollector.IncReloadErrors()
7283
return fmt.Errorf("failed to send the HUP signal to NGINX main: %w", err)
7384
}
7485

@@ -79,10 +90,19 @@ func (m *ManagerImpl) Reload(ctx context.Context, configVersion int) error {
7990
os.ReadFile,
8091
childProcsTimeout,
8192
); err != nil {
93+
m.managerCollector.IncReloadErrors()
8294
return fmt.Errorf(noNewWorkersErrFmt, configVersion, err)
8395
}
8496

85-
return m.verifyClient.waitForCorrectVersion(ctx, configVersion)
97+
if err = m.verifyClient.waitForCorrectVersion(ctx, configVersion); err != nil {
98+
m.managerCollector.IncReloadErrors()
99+
return err
100+
}
101+
m.managerCollector.IncReloadCount()
102+
103+
finish := time.Now()
104+
m.managerCollector.ObserveLastReloadTime(finish.Sub(start))
105+
return nil
86106
}
87107

88108
// EnsureNginxRunning ensures NGINX is running by locating the main process.

0 commit comments

Comments
 (0)