Skip to content

Commit 5fe764b

Browse files
authored
Add pprof labels in processes and for lifecycles (#19202)
Use pprof labelling to help identify goroutines with stacks. Signed-off-by: Andrew Thornton <[email protected]>
1 parent e48f3b0 commit 5fe764b

File tree

4 files changed

+55
-48
lines changed

4 files changed

+55
-48
lines changed

modules/graceful/manager.go

+16-34
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package graceful
66

77
import (
88
"context"
9+
"runtime/pprof"
910
"sync"
1011
"time"
1112

@@ -62,15 +63,13 @@ type WithCallback func(callback func())
6263
// Similarly the callback function provided to atTerminate must return once termination is complete.
6364
// Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals
6465
// - users must therefore be careful to only call these as necessary.
65-
// If run is not expected to run indefinitely RunWithShutdownChan is likely to be more appropriate.
6666
type RunnableWithShutdownFns func(atShutdown, atTerminate func(func()))
6767

6868
// RunWithShutdownFns takes a function that has both atShutdown and atTerminate callbacks
6969
// After the callback to atShutdown is called and is complete, the main function must return.
7070
// Similarly the callback function provided to atTerminate must return once termination is complete.
7171
// Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals
7272
// - users must therefore be careful to only call these as necessary.
73-
// If run is not expected to run indefinitely RunWithShutdownChan is likely to be more appropriate.
7473
func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) {
7574
g.runningServerWaitGroup.Add(1)
7675
defer g.runningServerWaitGroup.Done()
@@ -98,32 +97,6 @@ func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) {
9897
})
9998
}
10099

101-
// RunnableWithShutdownChan is a runnable with functions to run at shutdown and terminate.
102-
// After the atShutdown channel is closed, the main function must return once shutdown is complete.
103-
// (Optionally IsHammer may be waited for instead however, this should be avoided if possible.)
104-
// The callback function provided to atTerminate must return once termination is complete.
105-
// Please note that use of the atTerminate function will create a go-routine that will wait till terminate - users must therefore be careful to only call this as necessary.
106-
type RunnableWithShutdownChan func(atShutdown <-chan struct{}, atTerminate WithCallback)
107-
108-
// RunWithShutdownChan takes a function that has channel to watch for shutdown and atTerminate callbacks
109-
// After the atShutdown channel is closed, the main function must return once shutdown is complete.
110-
// (Optionally IsHammer may be waited for instead however, this should be avoided if possible.)
111-
// The callback function provided to atTerminate must return once termination is complete.
112-
// Please note that use of the atTerminate function will create a go-routine that will wait till terminate - users must therefore be careful to only call this as necessary.
113-
func (g *Manager) RunWithShutdownChan(run RunnableWithShutdownChan) {
114-
g.runningServerWaitGroup.Add(1)
115-
defer g.runningServerWaitGroup.Done()
116-
defer func() {
117-
if err := recover(); err != nil {
118-
log.Critical("PANIC during RunWithShutdownChan: %v\nStacktrace: %s", err, log.Stack(2))
119-
g.doShutdown()
120-
}
121-
}()
122-
run(g.IsShutdown(), func(atTerminate func()) {
123-
g.RunAtTerminate(atTerminate)
124-
})
125-
}
126-
127100
// RunWithShutdownContext takes a function that has a context to watch for shutdown.
128101
// After the provided context is Done(), the main function must return once shutdown is complete.
129102
// (Optionally the HammerContext may be obtained and waited for however, this should be avoided if possible.)
@@ -136,7 +109,9 @@ func (g *Manager) RunWithShutdownContext(run func(context.Context)) {
136109
g.doShutdown()
137110
}
138111
}()
139-
run(g.ShutdownContext())
112+
ctx := g.ShutdownContext()
113+
pprof.SetGoroutineLabels(ctx) // We don't have a label to restore back to but I think this is fine
114+
run(ctx)
140115
}
141116

142117
// RunAtTerminate adds to the terminate wait group and creates a go-routine to run the provided function at termination
@@ -198,6 +173,8 @@ func (g *Manager) doShutdown() {
198173
}
199174
g.lock.Lock()
200175
g.shutdownCtxCancel()
176+
atShutdownCtx := pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "post-shutdown"))
177+
pprof.SetGoroutineLabels(atShutdownCtx)
201178
for _, fn := range g.toRunAtShutdown {
202179
go fn()
203180
}
@@ -214,7 +191,7 @@ func (g *Manager) doShutdown() {
214191
g.doTerminate()
215192
g.WaitForTerminate()
216193
g.lock.Lock()
217-
g.doneCtxCancel()
194+
g.managerCtxCancel()
218195
g.lock.Unlock()
219196
}()
220197
}
@@ -227,6 +204,8 @@ func (g *Manager) doHammerTime(d time.Duration) {
227204
default:
228205
log.Warn("Setting Hammer condition")
229206
g.hammerCtxCancel()
207+
atHammerCtx := pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "post-hammer"))
208+
pprof.SetGoroutineLabels(atHammerCtx)
230209
for _, fn := range g.toRunAtHammer {
231210
go fn()
232211
}
@@ -244,6 +223,9 @@ func (g *Manager) doTerminate() {
244223
default:
245224
log.Warn("Terminating")
246225
g.terminateCtxCancel()
226+
atTerminateCtx := pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "post-terminate"))
227+
pprof.SetGoroutineLabels(atTerminateCtx)
228+
247229
for _, fn := range g.toRunAtTerminate {
248230
go fn()
249231
}
@@ -331,20 +313,20 @@ func (g *Manager) InformCleanup() {
331313

332314
// Done allows the manager to be viewed as a context.Context; it returns a channel that is closed when the server is finished terminating
333315
func (g *Manager) Done() <-chan struct{} {
334-
return g.doneCtx.Done()
316+
return g.managerCtx.Done()
335317
}
336318

337319
// Err allows the manager to be viewed as a context.Context done at Terminate
338320
func (g *Manager) Err() error {
339-
return g.doneCtx.Err()
321+
return g.managerCtx.Err()
340322
}
341323

342324
// Value allows the manager to be viewed as a context.Context done at Terminate
343325
func (g *Manager) Value(key interface{}) interface{} {
344-
return g.doneCtx.Value(key)
326+
return g.managerCtx.Value(key)
345327
}
346328

347329
// Deadline returns the deadline of the underlying manager context (the manager itself sets no fixed deadline); it allows the manager to be viewed as a context.Context
348330
func (g *Manager) Deadline() (deadline time.Time, ok bool) {
349-
return g.doneCtx.Deadline()
331+
return g.managerCtx.Deadline()
350332
}

modules/graceful/manager_unix.go

+14-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"errors"
1313
"os"
1414
"os/signal"
15+
"runtime/pprof"
1516
"sync"
1617
"syscall"
1718
"time"
@@ -29,11 +30,11 @@ type Manager struct {
2930
shutdownCtx context.Context
3031
hammerCtx context.Context
3132
terminateCtx context.Context
32-
doneCtx context.Context
33+
managerCtx context.Context
3334
shutdownCtxCancel context.CancelFunc
3435
hammerCtxCancel context.CancelFunc
3536
terminateCtxCancel context.CancelFunc
36-
doneCtxCancel context.CancelFunc
37+
managerCtxCancel context.CancelFunc
3738
runningServerWaitGroup sync.WaitGroup
3839
createServerWaitGroup sync.WaitGroup
3940
terminateWaitGroup sync.WaitGroup
@@ -58,7 +59,17 @@ func (g *Manager) start(ctx context.Context) {
5859
g.terminateCtx, g.terminateCtxCancel = context.WithCancel(ctx)
5960
g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(ctx)
6061
g.hammerCtx, g.hammerCtxCancel = context.WithCancel(ctx)
61-
g.doneCtx, g.doneCtxCancel = context.WithCancel(ctx)
62+
g.managerCtx, g.managerCtxCancel = context.WithCancel(ctx)
63+
64+
// Next add pprof labels to these contexts
65+
g.terminateCtx = pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "with-terminate"))
66+
g.shutdownCtx = pprof.WithLabels(g.shutdownCtx, pprof.Labels("graceful-lifecycle", "with-shutdown"))
67+
g.hammerCtx = pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "with-hammer"))
68+
g.managerCtx = pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "with-manager"))
69+
70+
// Now label this and all goroutines created by this goroutine with the graceful-lifecycle manager
71+
pprof.SetGoroutineLabels(g.managerCtx)
72+
defer pprof.SetGoroutineLabels(ctx)
6273

6374
// Set the running state & handle signals
6475
g.setState(stateRunning)

modules/graceful/manager_windows.go

+14-3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ package graceful
1111
import (
1212
"context"
1313
"os"
14+
"runtime/pprof"
1415
"strconv"
1516
"sync"
1617
"time"
@@ -40,11 +41,11 @@ type Manager struct {
4041
shutdownCtx context.Context
4142
hammerCtx context.Context
4243
terminateCtx context.Context
43-
doneCtx context.Context
44+
managerCtx context.Context
4445
shutdownCtxCancel context.CancelFunc
4546
hammerCtxCancel context.CancelFunc
4647
terminateCtxCancel context.CancelFunc
47-
doneCtxCancel context.CancelFunc
48+
managerCtxCancel context.CancelFunc
4849
runningServerWaitGroup sync.WaitGroup
4950
createServerWaitGroup sync.WaitGroup
5051
terminateWaitGroup sync.WaitGroup
@@ -71,7 +72,17 @@ func (g *Manager) start() {
7172
g.terminateCtx, g.terminateCtxCancel = context.WithCancel(g.ctx)
7273
g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(g.ctx)
7374
g.hammerCtx, g.hammerCtxCancel = context.WithCancel(g.ctx)
74-
g.doneCtx, g.doneCtxCancel = context.WithCancel(g.ctx)
75+
g.managerCtx, g.managerCtxCancel = context.WithCancel(g.ctx)
76+
77+
// Next add pprof labels to these contexts
78+
g.terminateCtx = pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "with-terminate"))
79+
g.shutdownCtx = pprof.WithLabels(g.shutdownCtx, pprof.Labels("graceful-lifecycle", "with-shutdown"))
80+
g.hammerCtx = pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "with-hammer"))
81+
g.managerCtx = pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "with-manager"))
82+
83+
// Now label this and all goroutines created by this goroutine with the graceful-lifecycle manager
84+
pprof.SetGoroutineLabels(g.managerCtx)
85+
defer pprof.SetGoroutineLabels(g.ctx)
7586

7687
// Make channels
7788
g.shutdownRequested = make(chan struct{})

modules/process/manager.go

+11-8
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"io"
1313
"os/exec"
14+
"runtime/pprof"
1415
"sort"
1516
"strconv"
1617
"sync"
@@ -66,11 +67,9 @@ func GetManager() *Manager {
6667
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
6768
// process table.
6869
func (pm *Manager) AddContext(parent context.Context, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
69-
parentPID := GetParentPID(parent)
70-
7170
ctx, cancel = context.WithCancel(parent)
7271

73-
pid, finished := pm.Add(parentPID, description, cancel)
72+
ctx, pid, finished := pm.Add(ctx, description, cancel)
7473

7574
return &Context{
7675
Context: ctx,
@@ -87,11 +86,9 @@ func (pm *Manager) AddContext(parent context.Context, description string) (ctx c
8786
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
8887
// process table.
8988
func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Duration, description string) (ctx context.Context, cancel context.CancelFunc, finshed FinishedFunc) {
90-
parentPID := GetParentPID(parent)
91-
9289
ctx, cancel = context.WithTimeout(parent, timeout)
9390

94-
pid, finshed := pm.Add(parentPID, description, cancel)
91+
ctx, pid, finshed := pm.Add(ctx, description, cancel)
9592

9693
return &Context{
9794
Context: ctx,
@@ -100,7 +97,9 @@ func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Durati
10097
}
10198

10299
// Add creates a new process
103-
func (pm *Manager) Add(parentPID IDType, description string, cancel context.CancelFunc) (IDType, FinishedFunc) {
100+
func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc) (context.Context, IDType, FinishedFunc) {
101+
parentPID := GetParentPID(ctx)
102+
104103
pm.mutex.Lock()
105104
start, pid := pm.nextPID()
106105

@@ -120,6 +119,7 @@ func (pm *Manager) Add(parentPID IDType, description string, cancel context.Canc
120119
finished := func() {
121120
cancel()
122121
pm.remove(process)
122+
pprof.SetGoroutineLabels(ctx)
123123
}
124124

125125
if parent != nil {
@@ -128,7 +128,10 @@ func (pm *Manager) Add(parentPID IDType, description string, cancel context.Canc
128128
pm.processes[pid] = process
129129
pm.mutex.Unlock()
130130

131-
return pid, finished
131+
pprofCtx := pprof.WithLabels(ctx, pprof.Labels("process-description", description, "ppid", string(parentPID), "pid", string(pid)))
132+
pprof.SetGoroutineLabels(pprofCtx)
133+
134+
return pprofCtx, pid, finished
132135
}
133136

134137
// nextPID will return the next available PID. pm.mutex should already be locked.

0 commit comments

Comments
 (0)