Skip to content

Commit 5cc2206

Browse files
committed
[Feature] Parametrize Scheduling Graceful Duration
1 parent a4d7331 commit 5cc2206

File tree

6 files changed

+39
-1
lines changed

6 files changed

+39
-1
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
- (Maintenance) Update Go to 1.22.2
2121
- (Feature) Object Checksum
2222
- (Bugfix) Use Rendered Spec in case of scheduling compare
23+
- (Feature) Parametrize Scheduling Graceful Duration
2324

2425
## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11)
2526
- (Feature) Extract Scheduler API

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ Flags:
198198
--timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s)
199199
--timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s)
200200
--timeout.k8s duration The request timeout to the kubernetes (default 2s)
201+
--timeout.pod-scheduling-grace-period duration Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable (default 15s)
201202
--timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s)
202203
--timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s)
203204
--timeout.shard-rebuild-retry duration Timeout after which rebuild shards retry flow is triggered (default 4h0m0s)

cmd/cmd.go

+3
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ var (
157157
backupArangoD time.Duration
158158
backupUploadArangoD time.Duration
159159
forcePodDeletionGracePeriod time.Duration
160+
podSchedulingGracePeriod time.Duration
160161
}
161162
operatorImageDiscovery struct {
162163
timeout time.Duration
@@ -226,6 +227,7 @@ func init() {
226227
f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls")
227228
f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files")
228229
f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals")
230+
f.DurationVar(&operatorTimeouts.podSchedulingGracePeriod, "timeout.pod-scheduling-grace-period", globals.DefaultPodSchedulingGracePeriod, "Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable")
229231
f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers")
230232
f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers")
231233
f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop")
@@ -294,6 +296,7 @@ func executeMain(cmd *cobra.Command, args []string) {
294296
globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD)
295297
globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD)
296298
globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod)
299+
globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Set(operatorTimeouts.podSchedulingGracePeriod)
297300

298301
globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay)
299302
globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count)

pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go

+21
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@ package reconcile
2323
import (
2424
"context"
2525
"reflect"
26+
"time"
2627

2728
core "k8s.io/api/core/v1"
2829

2930
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
3031
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
3132
"github.com/arangodb/kube-arangodb/pkg/util"
33+
"github.com/arangodb/kube-arangodb/pkg/util/globals"
3234
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3335
)
3436

@@ -38,6 +40,12 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
3840
_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
3941

4042
var p api.Plan
43+
44+
if globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() == 0 {
45+
// Scheduling grace period is not enabled
46+
return nil
47+
}
48+
4149
if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
4250
return p
4351
}
@@ -55,6 +63,19 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
5563
continue
5664
}
5765

66+
if c, ok := m.Member.Conditions.Get(api.ConditionTypeScheduled); !ok {
67+
// Action cannot proceed if pod is not scheduled
68+
continue
69+
} else if c.LastTransitionTime.IsZero() {
70+
// LastTransitionTime is not set
71+
continue
72+
} else {
73+
if time.Since(c.LastTransitionTime.Time) <= globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() {
74+
// In grace period
75+
continue
76+
}
77+
}
78+
5879
imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
5980
if !imageFound {
6081
l.Warn("could not find image for already created member")

pkg/deployment/resources/pod_inspector.go

+5
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,11 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
393393
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
394394
}
395395
} else {
396+
if memberStatus.Conditions.Update(api.ConditionTypeScheduled, false, "Pod is not scheduled", "") {
397+
updateMemberStatusNeeded = true
398+
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
399+
}
400+
396401
if k8sutil.IsPodNotScheduledFor(pod, podScheduleTimeout) {
397402
// Pod cannot be scheduled for too long
398403
log.Str("pod-name", pod.GetName()).Debug("Pod scheduling timeout")

pkg/util/globals/global.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ const (
2929
DefaultArangoDCheckTimeout = time.Second * 2
3030
DefaultReconciliationTimeout = time.Minute
3131
DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute
32+
DefaultPodSchedulingGracePeriod = 15 * time.Second
3233

3334
BackupDefaultArangoClientTimeout = 30 * time.Second
3435
BackupUploadArangoClientTimeout = 300 * time.Second
@@ -61,6 +62,7 @@ var globalObj = &globals{
6162
backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout),
6263
backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout),
6364
forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout),
65+
podSchedulingGracePeriod: NewTimeout(DefaultPodSchedulingGracePeriod),
6466
},
6567
kubernetes: &globalKubernetes{
6668
requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize),
@@ -147,6 +149,7 @@ type GlobalTimeouts interface {
147149
Agency() Timeout
148150

149151
ForcePodDeletionGracePeriodTimeout() Timeout
152+
PodSchedulingGracePeriod() Timeout
150153

151154
BackupArangoClientTimeout() Timeout
152155
BackupArangoClientUploadTimeout() Timeout
@@ -156,13 +159,17 @@ type globalTimeouts struct {
156159
requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout
157160
backupArangoClientTimeout Timeout
158161
backupArangoClientUploadTimeout Timeout
159-
forcePodDeletionGracePeriodTimeout Timeout
162+
forcePodDeletionGracePeriodTimeout, podSchedulingGracePeriod Timeout
160163
}
161164

162165
func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout {
163166
return g.forcePodDeletionGracePeriodTimeout
164167
}
165168

169+
func (g *globalTimeouts) PodSchedulingGracePeriod() Timeout {
170+
return g.podSchedulingGracePeriod
171+
}
172+
166173
func (g *globalTimeouts) Agency() Timeout {
167174
return g.agency
168175
}

0 commit comments

Comments
 (0)