Skip to content

Commit 16584e6

Browse files
committed
[Feature] Require ResignLeadership during upgrade
1 parent 3ffda22 commit 16584e6

23 files changed

+267
-14
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- (Feature) (Scheduler) Additional types
3838
- (Feature) Alternative Upgrade Order Feature
3939
- (Feature) (Scheduler) SchedV1 Integration
40+
- (Feature) Require ResignLeadership during upgrade
4041

4142
## [1.2.42](https://github.com/arangodb/kube-arangodb/tree/1.2.42) (2024-07-23)
4243
- (Maintenance) Go 1.22.4 & Kubernetes 1.29.6 libraries

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ Flags:
150150
--deployment.feature.backup-cleanup Cleanup imported backups if required - Required ArangoDB 3.8.0 or higher
151151
--deployment.feature.deployment-spec-defaults-restore Restore defaults from last accepted state of deployment - Required ArangoDB 3.8.0 or higher (default true)
152152
--deployment.feature.enforced-resign-leadership Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer - Required ArangoDB 3.8.0 or higher (default true)
153+
--deployment.feature.ensure-secured-resign-leadership Ensures that even if ResignLeadership job timeouted, data is still replicated on other servers - Required ArangoDB 3.8.0 or higher (default true)
153154
--deployment.feature.ephemeral-volumes Enables ephemeral volumes for apps and tmp directory - Required ArangoDB 3.8.0 or higher
154155
--deployment.feature.failover-leadership Support for leadership in fail-over mode - Required ArangoDB 3.8.0 or higher
155156
--deployment.feature.init-containers-copy-resources Copy resources spec to built-in init containers if they are not specified - Required ArangoDB 3.8.0 or higher (default true)

docs/cli/arangodb_operator.md

+1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ Flags:
4747
--deployment.feature.backup-cleanup Cleanup imported backups if required - Required ArangoDB 3.8.0 or higher
4848
--deployment.feature.deployment-spec-defaults-restore Restore defaults from last accepted state of deployment - Required ArangoDB 3.8.0 or higher (default true)
4949
--deployment.feature.enforced-resign-leadership Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer - Required ArangoDB 3.8.0 or higher (default true)
50+
--deployment.feature.ensure-secured-resign-leadership Ensures that even if ResignLeadership job timeouted, data is still replicated on other servers - Required ArangoDB 3.8.0 or higher (default true)
5051
--deployment.feature.ephemeral-volumes Enables ephemeral volumes for apps and tmp directory - Required ArangoDB 3.8.0 or higher
5152
--deployment.feature.failover-leadership Support for leadership in fail-over mode - Required ArangoDB 3.8.0 or higher
5253
--deployment.feature.init-containers-copy-resources Copy resources spec to built-in init containers if they are not specified - Required ArangoDB 3.8.0 or higher (default true)

docs/generated/actions.md

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ nav_order: 11
3737
| EncryptionKeyRemove | no | 10m0s | no | Enterprise Only | Remove the encryption key to the pool |
3838
| EncryptionKeyStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of encryption propagation |
3939
| EnforceResignLeadership | no | 45m0s | yes | Community & Enterprise | Run the ResignLeadership job on DBServer and checks data compatibility after |
40+
| EnsureSecuredResignLeadership | no | 10m0s | no | Community & Enterprise | Ensures that data is still replicated on other servers |
4041
| Idle | no | 10m0s | no | Community & Enterprise | Define idle operation in case if preconditions are not meet |
4142
| JWTAdd | no | 10m0s | no | Enterprise Only | Adds new JWT to the pool |
4243
| JWTClean | no | 10m0s | no | Enterprise Only | Remove JWT key from the pool |
@@ -133,6 +134,7 @@ spec:
133134
EncryptionKeyRemove: 10m0s
134135
EncryptionKeyStatusUpdate: 10m0s
135136
EnforceResignLeadership: 45m0s
137+
EnsureSecuredResignLeadership: 10m0s
136138
Idle: 10m0s
137139
JWTAdd: 10m0s
138140
JWTClean: 10m0s

internal/actions.config.go.tmpl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{{- $root := . -}}
22
//
3-
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
3+
// Copyright 2023-2024 ArangoDB GmbH, Cologne, Germany
44
//
55
// Licensed under the Apache License, Version 2.0 (the "License");
66
// you may not use this file except in compliance with the License.

internal/actions.go.tmpl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{{- $root := . -}}
22
//
3-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
3+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
44
//
55
// Licensed under the Apache License, Version 2.0 (the "License");
66
// you may not use this file except in compliance with the License.

internal/actions.register.go.tmpl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{{- $root := . -}}
22
//
3-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
3+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
44
//
55
// Licensed under the Apache License, Version 2.0 (the "License");
66
// you may not use this file except in compliance with the License.

internal/actions.register.test.go.tmpl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{{- $root := . -}}
22
//
3-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
3+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
44
//
55
// Licensed under the Apache License, Version 2.0 (the "License");
66
// you may not use this file except in compliance with the License.

internal/actions.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ actions:
3333
description: Run the ResignLeadership job on DBServer and checks data compatibility after
3434
timeout: 45m
3535
optional: true
36+
EnsureSecuredResignLeadership:
37+
description: Ensures that data is still replicated on other servers
38+
timeout: 10m
3639
KillMemberPod:
3740
description: Execute Delete on Pod (put pod in Terminating state)
3841
scopes:

pkg/apis/deployment/v1/actions.generated.go

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
2+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
33
//
44
// Licensed under the Apache License, Version 2.0 (the "License");
55
// you may not use this file except in compliance with the License.
@@ -101,6 +101,9 @@ const (
101101
// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
102102
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s
103103

104+
// ActionEnsureSecuredResignLeadershipDefaultTimeout define default timeout for action ActionEnsureSecuredResignLeadership
105+
ActionEnsureSecuredResignLeadershipDefaultTimeout time.Duration = 600 * time.Second // 10m0s
106+
104107
// ActionIdleDefaultTimeout define default timeout for action ActionIdle
105108
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout
106109

@@ -362,6 +365,9 @@ const (
362365
// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
363366
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"
364367

368+
// ActionTypeEnsureSecuredResignLeadership in scopes Normal. Ensures that data is still replicated on other servers
369+
ActionTypeEnsureSecuredResignLeadership ActionType = "EnsureSecuredResignLeadership"
370+
365371
// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
366372
ActionTypeIdle ActionType = "Idle"
367373

@@ -601,6 +607,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
601607
return ActionEncryptionKeyStatusUpdateDefaultTimeout
602608
case ActionTypeEnforceResignLeadership:
603609
return ActionEnforceResignLeadershipDefaultTimeout
610+
case ActionTypeEnsureSecuredResignLeadership:
611+
return ActionEnsureSecuredResignLeadershipDefaultTimeout
604612
case ActionTypeIdle:
605613
return ActionIdleDefaultTimeout
606614
case ActionTypeJWTAdd:
@@ -779,6 +787,8 @@ func (a ActionType) Priority() ActionPriority {
779787
return ActionPriorityNormal
780788
case ActionTypeEnforceResignLeadership:
781789
return ActionPriorityNormal
790+
case ActionTypeEnsureSecuredResignLeadership:
791+
return ActionPriorityNormal
782792
case ActionTypeIdle:
783793
return ActionPriorityNormal
784794
case ActionTypeJWTAdd:
@@ -969,6 +979,8 @@ func (a ActionType) Optional() bool {
969979
return false
970980
case ActionTypeEnforceResignLeadership:
971981
return true
982+
case ActionTypeEnsureSecuredResignLeadership:
983+
return false
972984
case ActionTypeIdle:
973985
return false
974986
case ActionTypeJWTAdd:

pkg/apis/deployment/v2alpha1/actions.generated.go

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
2+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
33
//
44
// Licensed under the Apache License, Version 2.0 (the "License");
55
// you may not use this file except in compliance with the License.
@@ -101,6 +101,9 @@ const (
101101
// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
102102
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s
103103

104+
// ActionEnsureSecuredResignLeadershipDefaultTimeout define default timeout for action ActionEnsureSecuredResignLeadership
105+
ActionEnsureSecuredResignLeadershipDefaultTimeout time.Duration = 600 * time.Second // 10m0s
106+
104107
// ActionIdleDefaultTimeout define default timeout for action ActionIdle
105108
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout
106109

@@ -362,6 +365,9 @@ const (
362365
// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
363366
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"
364367

368+
// ActionTypeEnsureSecuredResignLeadership in scopes Normal. Ensures that data is still replicated on other servers
369+
ActionTypeEnsureSecuredResignLeadership ActionType = "EnsureSecuredResignLeadership"
370+
365371
// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
366372
ActionTypeIdle ActionType = "Idle"
367373

@@ -601,6 +607,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
601607
return ActionEncryptionKeyStatusUpdateDefaultTimeout
602608
case ActionTypeEnforceResignLeadership:
603609
return ActionEnforceResignLeadershipDefaultTimeout
610+
case ActionTypeEnsureSecuredResignLeadership:
611+
return ActionEnsureSecuredResignLeadershipDefaultTimeout
604612
case ActionTypeIdle:
605613
return ActionIdleDefaultTimeout
606614
case ActionTypeJWTAdd:
@@ -779,6 +787,8 @@ func (a ActionType) Priority() ActionPriority {
779787
return ActionPriorityNormal
780788
case ActionTypeEnforceResignLeadership:
781789
return ActionPriorityNormal
790+
case ActionTypeEnsureSecuredResignLeadership:
791+
return ActionPriorityNormal
782792
case ActionTypeIdle:
783793
return ActionPriorityNormal
784794
case ActionTypeJWTAdd:
@@ -969,6 +979,8 @@ func (a ActionType) Optional() bool {
969979
return false
970980
case ActionTypeEnforceResignLeadership:
971981
return true
982+
case ActionTypeEnsureSecuredResignLeadership:
983+
return false
972984
case ActionTypeIdle:
973985
return false
974986
case ActionTypeJWTAdd:

pkg/deployment/agency/state/state.go

+42
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,48 @@ func (s State) PlanLeaderServersWithFailOver() Servers {
262262
return r
263263
}
264264

265+
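// IsServerWithShardBackup returns false if the server is the first and only server listed in Current for any shard that Plan assigns to more than one server; otherwise it returns true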
266+
func (s State) IsServerWithShardBackup(server Server) bool {
267+
for db, dbData := range s.Plan.Collections {
268+
for collection, collectionData := range dbData {
269+
for shard, shardDetails := range collectionData.Shards {
270+
if len(shardDetails) <= 1 {
271+
// RF is 1, nothing to do
272+
continue
273+
}
274+
275+
// Find current state
276+
currentDBs, ok := s.Current.Collections[db]
277+
if !ok {
278+
continue
279+
}
280+
281+
currentCollection, ok := currentDBs[collection]
282+
if !ok {
283+
continue
284+
}
285+
286+
currentShard, ok := currentCollection[shard]
287+
if !ok {
288+
continue
289+
}
290+
291+
if len(currentShard.Servers) == 0 {
292+
continue
293+
}
294+
295+
if currentShard.Servers[0] == server {
296+
if len(currentShard.Servers) == 1 {
297+
return false
298+
}
299+
}
300+
}
301+
}
302+
}
303+
304+
return true
305+
}
306+
265307
type CollectionShardDetails []CollectionShardDetail
266308

267309
type CollectionShardDetail struct {

pkg/deployment/agency/state/state_test.go

+52
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,58 @@ func Test_IsDBServerReadyToRestart(t *testing.T) {
307307
}
308308
}
309309

310+
func Test_IsServerWithShardBackup(t *testing.T) {
311+
type testCase struct {
312+
generator Generator
313+
ready bool
314+
server Server
315+
}
316+
newDBWithCol := func(writeConcern int) CollectionGeneratorInterface {
317+
return NewDatabaseRandomGenerator().RandomCollection().WithWriteConcern(writeConcern)
318+
}
319+
tcs := map[string]testCase{
320+
"missing replica": {
321+
generator: newDBWithCol(1).WithShard().WithPlan("A", "B").WithCurrent("A").Add().Add().Add(),
322+
ready: false,
323+
server: "A",
324+
},
325+
"ready replica": {
326+
generator: newDBWithCol(1).WithShard().WithPlan("A", "B").WithCurrent("A", "B").Add().Add().Add(),
327+
ready: true,
328+
server: "A",
329+
},
330+
"not affected replica": {
331+
generator: newDBWithCol(1).WithShard().WithPlan("A", "B").WithCurrent("A").Add().Add().Add(),
332+
ready: true,
333+
server: "B",
334+
},
335+
"not affected nonexisting replica": {
336+
generator: newDBWithCol(1).WithShard().WithPlan("A", "B").WithCurrent("A").Add().Add().Add(),
337+
ready: true,
338+
server: "C",
339+
},
340+
"rf1": {
341+
generator: newDBWithCol(1).WithShard().WithPlan("A").WithCurrent("A").Add().Add().Add(),
342+
ready: true,
343+
server: "A",
344+
},
345+
}
346+
347+
for name, tc := range tcs {
348+
t.Run(name, func(t *testing.T) {
349+
s := GenerateState(t, tc.generator)
350+
351+
res := s.IsServerWithShardBackup(tc.server)
352+
353+
if tc.ready {
354+
require.True(t, res)
355+
} else {
356+
require.False(t, res)
357+
}
358+
})
359+
}
360+
}
361+
310362
func Test_GetCollectionDatabaseByID(t *testing.T) {
311363
var s DumpState
312364
require.NoError(t, json.Unmarshal(agencyDump39, &s))

pkg/deployment/features/resign_leadership.go

+13
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ package features
2222

2323
func init() {
2424
registerFeature(enforcedResignLeadership)
25+
registerFeature(ensureSecuredResignLeadership)
2526
}
2627

2728
var enforcedResignLeadership = &feature{
@@ -31,7 +32,19 @@ var enforcedResignLeadership = &feature{
3132
enabledByDefault: true,
3233
}
3334

35+
var ensureSecuredResignLeadership = &feature{
36+
name: "ensure-secured-resign-leadership",
37+
description: "Ensures that even if ResignLeadership job timeouted, data is still replicated on other servers",
38+
enterpriseRequired: false,
39+
enabledByDefault: true,
40+
}
41+
3442
// EnforcedResignLeadership returns enforced ResignLeadership.
3543
func EnforcedResignLeadership() Feature {
3644
return enforcedResignLeadership
3745
}
46+
47+
// EnsureSecuredResignLeadership returns the ensure-secured-resign-leadership feature.
48+
func EnsureSecuredResignLeadership() Feature {
49+
return ensureSecuredResignLeadership
50+
}

pkg/deployment/reconcile/action.config.generated.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
2+
// Copyright 2023-2024 ArangoDB GmbH, Cologne, Germany
33
//
44
// Licensed under the Apache License, Version 2.0 (the "License");
55
// you may not use this file except in compliance with the License.

pkg/deployment/reconcile/action.register.generated.go

+18-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
2+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
33
//
44
// Licensed under the Apache License, Version 2.0 (the "License");
55
// you may not use this file except in compliance with the License.
@@ -96,6 +96,9 @@ var (
9696
_ Action = &actionEnforceResignLeadership{}
9797
_ actionFactory = newEnforceResignLeadershipAction
9898

99+
_ Action = &actionEnsureSecuredResignLeadership{}
100+
_ actionFactory = newEnsureSecuredResignLeadershipAction
101+
99102
_ Action = &actionIdle{}
100103
_ actionFactory = newIdleAction
101104

@@ -619,6 +622,20 @@ func init() {
619622
registerAction(action, function)
620623
}
621624

625+
// EnsureSecuredResignLeadership
626+
{
627+
// Get Action type
628+
action := api.ActionTypeEnsureSecuredResignLeadership
629+
630+
// Get Action definition
631+
function := newEnsureSecuredResignLeadershipAction
632+
633+
// Wrap action main function
634+
635+
// Register action
636+
registerAction(action, function)
637+
}
638+
622639
// Idle
623640
{
624641
// Get Action type

pkg/deployment/reconcile/action.register.generated_test.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
2+
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
33
//
44
// Licensed under the Apache License, Version 2.0 (the "License");
55
// you may not use this file except in compliance with the License.
@@ -286,6 +286,16 @@ func Test_Actions(t *testing.T) {
286286
})
287287
})
288288

289+
t.Run("EnsureSecuredResignLeadership", func(t *testing.T) {
290+
ActionsExistence(t, api.ActionTypeEnsureSecuredResignLeadership)
291+
t.Run("Internal", func(t *testing.T) {
292+
require.False(t, api.ActionTypeEnsureSecuredResignLeadership.Internal())
293+
})
294+
t.Run("Optional", func(t *testing.T) {
295+
require.False(t, api.ActionTypeEnsureSecuredResignLeadership.Optional())
296+
})
297+
})
298+
289299
t.Run("Idle", func(t *testing.T) {
290300
ActionsExistence(t, api.ActionTypeIdle)
291301
t.Run("Internal", func(t *testing.T) {

0 commit comments

Comments
 (0)