Skip to content

Commit 108aea9

Browse files
committed
Fix race condition when terminating supervisor after restoring default signals
We'd terminate the supervisor and let it consume the shutdown timeout, so it'd start the immediate shutdown, then do some assertions and terminate the supervisor again before it had the chance to finish, and as such, the TERM signal would arrive sometimes before and sometimes after default signal handlers would have been restored, resulting in an error (harmless, but annoying) as: ``` ..solid_queue/test/test_helper.rb:51: warning: Exception in finalizer #<Proc:0x00000001083ee550 (lambda)> ..solid_queue/test/test_helper.rb:51:in `fork': SIGTERM (SignalException) from ..solid_queue/test/test_helper.rb:51:in `run_supervisor_as_fork' from ..solid_queue/test/integration/processes_lifecycle_test.rb:9:in `block in <class:ProcessLifecycleTest>' ```
1 parent 9146e56 commit 108aea9

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

test/integration/processes_lifecycle_test.rb

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ class ProcessLifecycleTest < ActiveSupport::TestCase
9696
assert_job_status(no_pause, :finished)
9797
assert_job_status(pause, :finished)
9898

99+
wait_for_process_termination_with_timeout(@pid, timeout: 1.second)
99100
assert_clean_termination
100101
end
101102

@@ -112,12 +113,13 @@ class ProcessLifecycleTest < ActiveSupport::TestCase
112113
assert_job_status(no_pause, :finished)
113114
assert_job_status(pause, :finished)
114115

116+
wait_for_process_termination_with_timeout(@pid, timeout: 1.second)
115117
assert_clean_termination
116118
end
117119

118120
test "term supervisor exceeding timeout while there are jobs in-flight" do
119121
no_pause = enqueue_store_result_job("no pause")
120-
pause = enqueue_store_result_job("pause", pause: SolidQueue.shutdown_timeout + 1.second)
122+
pause = enqueue_store_result_job("pause", pause: SolidQueue.shutdown_timeout + 10.second)
121123

122124
signal_process(@pid, :TERM, wait: 0.1.second)
123125
wait_for_jobs_to_finish_for(SolidQueue.shutdown_timeout + 0.1.second)
@@ -132,6 +134,10 @@ class ProcessLifecycleTest < ActiveSupport::TestCase
132134

133135
# The process running the long job couldn't deregister, the other did
134136
assert_registered_workers_for(:background)
137+
138+
# Now wait until the supervisor finishes for real, which will complete the cleanup
139+
wait_for_process_termination_with_timeout(@pid, timeout: 1.second)
140+
assert_clean_termination
135141
end
136142

137143
test "process some jobs that raise errors" do
@@ -188,6 +194,7 @@ def assert_clean_termination
188194
wait_for_registered_processes 0, timeout: 0.2.second
189195
assert_no_registered_processes
190196
assert_no_claimed_jobs
197+
assert_not process_exists?(@pid)
191198
end
192199

193200
def assert_registered_workers_for(*queues)

0 commit comments

Comments
 (0)