Skip to content

Commit 2774b08

Browse files
authored
Merge pull request #160 from arangodb/feature/plan-action-timeout
Reconciliation plan-item timeout
2 parents d3e1feb + b3c6918 commit 2774b08

14 files changed

+118
-3
lines changed

pkg/deployment/reconcile/action.go

+3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728
)
2829

2930
// Action executes a single Plan item.
@@ -35,4 +36,6 @@ type Action interface {
3536
// CheckProgress checks the progress of the action.
3637
// Returns true if the action is completely finished, false otherwise.
3738
CheckProgress(ctx context.Context) (bool, error)
39+
// Timeout returns the amount of time after which this action will time out.
40+
Timeout() time.Duration
3841
}

pkg/deployment/reconcile/action_add_member.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
2930
"github.com/rs/zerolog"
@@ -64,3 +65,8 @@ func (a *actionAddMember) CheckProgress(ctx context.Context) (bool, error) {
6465
// Nothing todo
6566
return true, nil
6667
}
68+
69+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level addMemberTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
70+
func (a *actionAddMember) Timeout() time.Duration {
71+
return addMemberTimeout
72+
}

pkg/deployment/reconcile/action_cleanout_member.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
2930
"github.com/rs/zerolog"
@@ -114,3 +115,8 @@ func (a *actionCleanoutMember) CheckProgress(ctx context.Context) (bool, error)
114115
// Cleanout completed
115116
return true, nil
116117
}
118+
119+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level cleanoutMemberTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
120+
func (a *actionCleanoutMember) Timeout() time.Duration {
121+
return cleanoutMemberTimeout
122+
}

pkg/deployment/reconcile/action_remove_member.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
"github.com/pkg/errors"
2930
"github.com/rs/zerolog"
@@ -94,3 +95,8 @@ func (a *actionRemoveMember) CheckProgress(ctx context.Context) (bool, error) {
9495
// Nothing todo
9596
return true, nil
9697
}
98+
99+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level removeMemberTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
100+
func (a *actionRemoveMember) Timeout() time.Duration {
101+
return removeMemberTimeout
102+
}

pkg/deployment/reconcile/action_renew_tls_certificate.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
2930
"github.com/rs/zerolog"
@@ -69,3 +70,8 @@ func (a *renewTLSCertificateAction) Start(ctx context.Context) (bool, error) {
6970
func (a *renewTLSCertificateAction) CheckProgress(ctx context.Context) (bool, error) {
7071
return true, nil
7172
}
73+
74+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level renewTLSCertificateTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
75+
func (a *renewTLSCertificateAction) Timeout() time.Duration {
76+
return renewTLSCertificateTimeout
77+
}

pkg/deployment/reconcile/action_rotate_member.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
2930
"github.com/rs/zerolog"
@@ -116,3 +117,8 @@ func (a *actionRotateMember) CheckProgress(ctx context.Context) (bool, error) {
116117
}
117118
return true, nil
118119
}
120+
121+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level rotateMemberTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
122+
func (a *actionRotateMember) Timeout() time.Duration {
123+
return rotateMemberTimeout
124+
}

pkg/deployment/reconcile/action_shutdown_member.go

+5
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,8 @@ func (a *actionShutdownMember) CheckProgress(ctx context.Context) (bool, error)
111111
// Member still not shutdown, retry soon
112112
return false, nil
113113
}
114+
115+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level shutdownMemberTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
116+
func (a *actionShutdownMember) Timeout() time.Duration {
117+
return shutdownMemberTimeout
118+
}

pkg/deployment/reconcile/action_upgrade_member.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
2930
"github.com/rs/zerolog"
@@ -126,3 +127,8 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, error) {
126127
}
127128
return isUpgrading, nil
128129
}
130+
131+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level upgradeMemberTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
132+
func (a *actionUpgradeMember) Timeout() time.Duration {
133+
return upgradeMemberTimeout
134+
}

pkg/deployment/reconcile/action_wait_for_member_up.go

+6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package reconcile
2424

2525
import (
2626
"context"
27+
"time"
2728

2829
driver "github.com/arangodb/go-driver"
2930
"github.com/arangodb/go-driver/agency"
@@ -164,3 +165,8 @@ func (a *actionWaitForMemberUp) checkProgressArangoSync(ctx context.Context) (bo
164165
}
165166
return true, nil
166167
}
168+
169+
// Timeout returns the amount of time after which this action will time out.
// For this action that is the package-level waitForMemberUpTimeout constant.
// ExecutePlan adds the returned duration to the plan item's creation time to
// compute the deadline for the action.
170+
func (a *actionWaitForMemberUp) Timeout() time.Duration {
171+
return waitForMemberUpTimeout
172+
}

pkg/deployment/reconcile/context.go

+3
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ type Context interface {
5454
GetAgencyClients(ctx context.Context, predicate func(id string) bool) ([]driver.Connection, error)
5555
// GetSyncServerClient returns a cached client for a specific arangosync server.
5656
GetSyncServerClient(ctx context.Context, group api.ServerGroup, id string) (client.API, error)
57+
// CreateEvent creates a given event.
58+
// On error, the error is logged.
59+
CreateEvent(evt *v1.Event)
5760
// CreateMember adds a new member to the given group.
5861
// If ID is non-empty, it will be used, otherwise a new ID is created.
5962
CreateMember(group api.ServerGroup, id string) error

pkg/deployment/reconcile/plan_executor.go

+18-2
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@ package reconcile
2525
import (
2626
"context"
2727
"fmt"
28+
"time"
2829

30+
"github.com/rs/zerolog"
2931
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3032

3133
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
32-
"github.com/rs/zerolog"
34+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3335
)
3436

3537
// ExecutePlan tries to execute the plan as far as possible.
@@ -106,7 +108,21 @@ func (d *Reconciler) ExecutePlan(ctx context.Context) (bool, error) {
106108
}
107109
log.Debug().Bool("ready", ready).Msg("Action CheckProgress completed")
108110
if !ready {
109-
// Not ready check, come back soon
111+
// Not ready yet, check timeout
112+
deadline := planAction.CreationTime.Add(action.Timeout())
113+
if time.Now().After(deadline) {
114+
// Timeout has expired
115+
log.Warn().Msg("Action not finished in time. Removing the entire plan")
116+
d.context.CreateEvent(k8sutil.NewPlanTimeoutEvent(d.context.GetAPIObject(), string(planAction.Type), planAction.MemberID, planAction.Group.AsRole()))
117+
// Replace plan with empty one and save it.
118+
status.Plan = api.Plan{}
119+
if err := d.context.UpdateStatus(status); err != nil {
120+
log.Debug().Err(err).Msg("Failed to update CR status")
121+
return false, maskAny(err)
122+
}
123+
return true, nil
124+
}
125+
// Timeout not yet expired, come back soon
110126
return true, nil
111127
}
112128
// Continue with next action

pkg/deployment/reconcile/timeouts.go

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package reconcile
24+
25+
import "time"
26+
27+
// Per-action timeouts for reconciliation plan items.
//
// Each constant is returned by the Timeout method of the matching action type.
// ExecutePlan adds it to the plan item's creation time; when that deadline has
// passed and the action is still not finished, the entire plan is removed.
const (
	// addMemberTimeout bounds the add-member action.
	addMemberTimeout = 5 * time.Minute
	// cleanoutMemberTimeout bounds the cleanout-member action.
	cleanoutMemberTimeout = 12 * time.Hour
	// removeMemberTimeout bounds the remove-member action.
	removeMemberTimeout = 15 * time.Minute
	// renewTLSCertificateTimeout bounds the renew-TLS-certificate action.
	renewTLSCertificateTimeout = 30 * time.Minute
	// rotateMemberTimeout bounds the rotate-member action.
	rotateMemberTimeout = 30 * time.Minute
	// shutdownMemberTimeout bounds the shutdown-member action.
	shutdownMemberTimeout = 30 * time.Minute
	// upgradeMemberTimeout bounds the upgrade-member action.
	upgradeMemberTimeout = 6 * time.Hour
	// waitForMemberUpTimeout bounds the wait-for-member-up action.
	waitForMemberUpTimeout = 15 * time.Minute
)

pkg/deployment/resources/context.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ type Context interface {
6262
GetLifecycleImage() string
6363
// GetNamespace returns the namespace that contains the deployment
6464
GetNamespace() string
65-
// createEvent creates a given event.
65+
// CreateEvent creates a given event.
6666
// On error, the error is logged.
6767
CreateEvent(evt *v1.Event)
6868
// GetOwnedPods returns a list of all pods owned by the deployment.

pkg/util/k8sutil/events.go

+10
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,16 @@ func NewAccessPackageDeletedEvent(apiObject APIObject, apSecretName string) *v1.
145145
return event
146146
}
147147

148+
// NewPlanTimeoutEvent creates an event indicating that an item on a reconciliation plan did not
149+
// finish before its deadline.
150+
func NewPlanTimeoutEvent(apiObject APIObject, itemType, memberID, role string) *v1.Event {
151+
event := newDeploymentEvent(apiObject)
152+
event.Type = v1.EventTypeNormal
153+
event.Reason = "Reconciliation Plan Timeout"
154+
event.Message = fmt.Sprintf("An plan item of type %s or member %s with role %s did not finish in time", itemType, memberID, role)
155+
return event
156+
}
157+
148158
// NewErrorEvent creates an event of type error.
149159
func NewErrorEvent(reason string, err error, apiObject APIObject) *v1.Event {
150160
event := newDeploymentEvent(apiObject)

0 commit comments

Comments
 (0)