Skip to content

Commit c9eeb3c

Browse files
Use token based approach for system-agent
Reduce the footprint of the system-agent RBAC Per each cluster there will be created: - 1 system-agent ServiceAccount Per each plan there will be temporarily created: - 1 Role with access to all plan secrets for each machine - 1 RoleBinding for the role and the cluster system-agent ServiceAccount On plan completion/failure the role and rolebinding will be revoked Per each machine there will be created: - 1 Secret for the system-agent authentication, with unique JWT bound to the secret existence in the API server, and a namespace/name pointer to the plan secret - 1 Secret for the plan execution Signed-off-by: Danil-Grigorev <[email protected]>
1 parent 54fc8ff commit c9eeb3c

File tree

10 files changed

+205
-167
lines changed

10 files changed

+205
-167
lines changed

charts/rancher-turtles/templates/rancher-turtles-exp-etcdrestore-components.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,12 @@ rules:
422422
- patch
423423
- update
424424
- watch
425+
- apiGroups:
426+
- ""
427+
resources:
428+
- serviceaccounts/token
429+
verbs:
430+
- create
425431
- apiGroups:
426432
- authorization.k8s.io
427433
resources:

exp/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ export CLUSTER_NAMESPACE=default
6262
export CLUSTER_NAME=rke2
6363
export ETCD_MACHINE_SNAPSHOT_NAME="<snapshot_name_from_the_output>"
6464

65-
envsubst < etcdrestore/examples/etcd-restore.yaml | kubectl apply -f -
65+
envsubst < exp/etcdrestore/examples/etcd-restore.yaml | kubectl apply -f -
6666
```
6767

6868
## Cleanup

exp/etcdrestore/config/rbac/role.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ rules:
1919
- patch
2020
- update
2121
- watch
22+
- apiGroups:
23+
- ""
24+
resources:
25+
- serviceaccounts/token
26+
verbs:
27+
- create
2228
- apiGroups:
2329
- authorization.k8s.io
2430
resources:

exp/etcdrestore/controllers/etcdsnapshotrestore_controller.go

+31-14
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ import (
3939
snapshotrestorev1 "github.com/rancher/turtles/exp/etcdrestore/api/v1alpha1"
4040
)
4141

42-
// InitMachine is a filter matching on init machine of the ETCD snapshot
43-
func InitMachine(etcdMachineSnapshot *snapshotrestorev1.ETCDMachineSnapshot) collections.Func {
42+
// initMachine is a filter matching on init machine of the ETCD snapshot
43+
func initMachine(etcdMachineSnapshot *snapshotrestorev1.ETCDMachineSnapshot) collections.Func {
4444
return func(machine *clusterv1.Machine) bool {
4545
return machine.Name == etcdMachineSnapshot.Spec.MachineName
4646
}
@@ -104,6 +104,7 @@ type scope struct {
104104
//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters/status,verbs=get;list;watch;create;update;patch;delete
105105
//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;list;watch;create;update;patch;delete
106106
//+kubebuilder:rbac:groups="",resources=secrets;events;configmaps;serviceaccounts,verbs=get;list;watch;create;update;patch;delete
107+
//+kubebuilder:rbac:groups="",resources=serviceaccounts/token,verbs=create
107108
//+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles;rolebindings,verbs=get;list;watch;create;update;patch;delete
108109
//+kubebuilder:rbac:groups="management.cattle.io",resources=*,verbs=get;list;watch;create;update;patch;delete
109110
//+kubebuilder:rbac:groups=bootstrap.cluster.x-k8s.io,resources=rke2configs;rke2configs/status;rke2configs/finalizers,verbs=get;list;watch;create;update;patch;delete
@@ -159,7 +160,7 @@ func (r *ETCDSnapshotRestoreReconciler) reconcileNormal(ctx context.Context, etc
159160
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
160161
}
161162

162-
if scope.machines.Filter(InitMachine(scope.etcdMachineSnapshot)).Len() != 1 {
163+
if scope.machines.Filter(initMachine(scope.etcdMachineSnapshot)).Len() != 1 {
163164
return ctrl.Result{}, fmt.Errorf(
164165
"init machine %s for snapshot %s is not found",
165166
scope.etcdMachineSnapshot.Spec.MachineName,
@@ -191,15 +192,13 @@ func (r *ETCDSnapshotRestoreReconciler) reconcileNormal(ctx context.Context, etc
191192

192193
return ctrl.Result{}, nil
193194
case snapshotrestorev1.ETCDSnapshotRestorePhaseStarted:
194-
etcdSnapshotRestore.Status.Phase = snapshotrestorev1.ETCDSnapshotRestorePhaseShutdown
195-
196-
return ctrl.Result{}, nil
195+
return r.preparePlanPermissions(ctx, scope, etcdSnapshotRestore)
197196
case snapshotrestorev1.ETCDSnapshotRestorePhaseShutdown:
198197
// Stop RKE2 on all the machines.
199198
return r.stopRKE2OnAllMachines(ctx, scope, etcdSnapshotRestore)
200199
case snapshotrestorev1.ETCDSnapshotRestorePhaseRunning:
201200
// Restore the etcd snapshot on the init machine.
202-
return r.restoreSnaphotOnInitMachine(ctx, scope, etcdSnapshotRestore)
201+
return r.restoreSnapshotOnInitMachine(ctx, scope, etcdSnapshotRestore)
203202
case snapshotrestorev1.ETCDSnapshotRestorePhaseAgentRestart:
204203
// Start RKE2 on all the machines.
205204
return r.startRKE2OnAllMachines(ctx, scope, etcdSnapshotRestore)
@@ -212,7 +211,7 @@ func (r *ETCDSnapshotRestoreReconciler) reconcileNormal(ctx context.Context, etc
212211
case snapshotrestorev1.ETCDSnapshotRestorePhaseJoinAgents:
213212
return r.waitForMachinesToJoin(ctx, scope, etcdSnapshotRestore)
214213
case snapshotrestorev1.ETCDSnapshotRestorePhaseFinished, snapshotrestorev1.ETCDSnapshotRestorePhaseFailed:
215-
return ctrl.Result{}, nil
214+
return r.revokePlanPermissions(ctx, scope, etcdSnapshotRestore)
216215
}
217216

218217
return ctrl.Result{}, nil
@@ -251,6 +250,24 @@ func initScope(ctx context.Context, c client.Client, etcdSnapshotRestore *snapsh
251250
}, nil
252251
}
253252

253+
func (r *ETCDSnapshotRestoreReconciler) preparePlanPermissions(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
254+
if err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, scope.machines.Newest(), scope.machines).Permit(ctx); err != nil {
255+
return ctrl.Result{}, err
256+
}
257+
258+
etcdSnapshotRestore.Status.Phase = snapshotrestorev1.ETCDSnapshotRestorePhaseShutdown
259+
260+
return ctrl.Result{}, nil
261+
}
262+
263+
func (r *ETCDSnapshotRestoreReconciler) revokePlanPermissions(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
264+
if err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, scope.machines.Newest(), scope.machines).Revoke(ctx); err != nil {
265+
return ctrl.Result{}, err
266+
}
267+
268+
return ctrl.Result{}, nil
269+
}
270+
254271
func (r *ETCDSnapshotRestoreReconciler) stopRKE2OnAllMachines(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
255272
log := log.FromContext(ctx)
256273

@@ -259,7 +276,7 @@ func (r *ETCDSnapshotRestoreReconciler) stopRKE2OnAllMachines(ctx context.Contex
259276
log.Info("Stopping RKE2 on machine", "machine", machine.Name)
260277

261278
// Get the plan secret for the machine.
262-
applied, err := Plan(ctx, r.Client, machine).Apply(ctx, RKE2KillAll())
279+
applied, err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, machine, scope.machines).Apply(ctx, RKE2KillAll())
263280
if err != nil {
264281
return ctrl.Result{}, fmt.Errorf("failed to get plan secret for machine: %w", err)
265282
}
@@ -286,15 +303,15 @@ func (r *ETCDSnapshotRestoreReconciler) stopRKE2OnAllMachines(ctx context.Contex
286303
return ctrl.Result{}, nil
287304
}
288305

289-
func (r *ETCDSnapshotRestoreReconciler) restoreSnaphotOnInitMachine(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
306+
func (r *ETCDSnapshotRestoreReconciler) restoreSnapshotOnInitMachine(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
290307
log := log.FromContext(ctx)
291308

292-
initMachine := scope.machines.Filter(InitMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]
309+
initMachine := scope.machines.Filter(initMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]
293310

294311
log.Info("Filling plan secret with etcd restore instructions", "machine", initMachine.Name)
295312

296313
// Get the plan secret for the machine.
297-
applied, err := Plan(ctx, r.Client, initMachine).Apply(
314+
applied, err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, initMachine, scope.machines).Apply(
298315
ctx,
299316
RemoveServerURL(),
300317
ManifestRemoval(),
@@ -318,7 +335,7 @@ func (r *ETCDSnapshotRestoreReconciler) restoreSnaphotOnInitMachine(ctx context.
318335
func (r *ETCDSnapshotRestoreReconciler) startRKE2OnAllMachines(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
319336
log := log.FromContext(ctx)
320337

321-
initMachine := scope.machines.Filter(InitMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]
338+
initMachine := scope.machines.Filter(initMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]
322339

323340
// TODO: other registration methods
324341
initMachineIP := getInternalMachineIP(initMachine)
@@ -350,7 +367,7 @@ func (r *ETCDSnapshotRestoreReconciler) startRKE2OnAllMachines(ctx context.Conte
350367
StartRKE2())
351368
}
352369

353-
applied, err := Plan(ctx, r.Client, machine).Apply(ctx, instructions...)
370+
applied, err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, machine, scope.machines).Apply(ctx, instructions...)
354371
if err != nil {
355372
return ctrl.Result{}, fmt.Errorf("failed to patch plan secret: %w", err)
356373
} else if !applied.Finished {

exp/etcdrestore/controllers/planner.go

+91-7
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,21 @@ import (
3333
bootstrapv1 "github.com/rancher/cluster-api-provider-rke2/bootstrap/api/v1beta1"
3434
snapshotrestorev1 "github.com/rancher/turtles/exp/etcdrestore/api/v1alpha1"
3535
corev1 "k8s.io/api/core/v1"
36+
rbacv1 "k8s.io/api/rbac/v1"
3637
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
38+
"sigs.k8s.io/cluster-api/util/collections"
3739
"sigs.k8s.io/controller-runtime/pkg/client"
3840
"sigs.k8s.io/controller-runtime/pkg/log"
3941
)
4042

4143
// Planner is responsible for executing instructions on the underlying machine host
4244
// in the specified order, and collecting output from executed steps.
4345
type Planner struct {
46+
Name string
4447
client.Client
45-
machine *clusterv1.Machine
46-
secret *corev1.Secret
48+
machine *clusterv1.Machine
49+
machines collections.Machines
50+
secret *corev1.Secret
4751
}
4852

4953
// Instructions is a one time operation, used to perform shell commands on the host
@@ -64,16 +68,20 @@ type plan struct {
6468
}
6569

6670
// Plan is initializing Planner, used to perform instructions in a specific order and collect results
67-
func Plan(ctx context.Context, c client.Client, machine *clusterv1.Machine) *Planner {
71+
func Plan(ctx context.Context, c client.Client, name string, machine *clusterv1.Machine, machines collections.Machines) *Planner {
6872
return &Planner{
69-
Client: c,
70-
machine: machine,
71-
secret: initSecret(machine, map[string][]byte{}),
73+
Client: c,
74+
Name: name,
75+
machine: machine,
76+
machines: machines,
77+
secret: initSecret(machine, map[string][]byte{}),
7278
}
7379
}
7480

7581
func initSecret(machine *clusterv1.Machine, data map[string][]byte) *corev1.Secret {
76-
planSecretName := strings.Join([]string{machine.Spec.Bootstrap.ConfigRef.Name, "rke2config", "plan"}, "-")
82+
kind := strings.ToLower(machine.Spec.Bootstrap.ConfigRef.Kind)
83+
name := machine.Spec.Bootstrap.ConfigRef.Name
84+
planSecretName := strings.Join([]string{name, kind, "plan"}, "-")
7785

7886
return &corev1.Secret{
7987
TypeMeta: metav1.TypeMeta{
@@ -83,6 +91,12 @@ func initSecret(machine *clusterv1.Machine, data map[string][]byte) *corev1.Secr
8391
ObjectMeta: metav1.ObjectMeta{
8492
Namespace: machine.Namespace,
8593
Name: planSecretName,
94+
OwnerReferences: []metav1.OwnerReference{{
95+
Name: machine.Name,
96+
Kind: "Machine",
97+
UID: machine.UID,
98+
APIVersion: clusterv1.GroupVersion.String(),
99+
}},
86100
},
87101
Data: data,
88102
}
@@ -247,6 +261,76 @@ func (p *Planner) applied(plan, appliedChecksum []byte) bool {
247261
return planHash == string(appliedChecksum)
248262
}
249263

264+
// planRole returns the Role for the Plan.
265+
func (p *Planner) planRole() *rbacv1.Role {
266+
secrets := []string{}
267+
for _, machine := range p.machines.UnsortedList() {
268+
planSecretName := strings.Join([]string{machine.Spec.Bootstrap.ConfigRef.Name, "rke2config", "plan"}, "-")
269+
secrets = append(secrets, planSecretName)
270+
}
271+
272+
return &rbacv1.Role{
273+
ObjectMeta: metav1.ObjectMeta{
274+
Name: p.machine.Labels[clusterv1.ClusterNameLabel] + "-" + p.Name,
275+
Namespace: p.machine.Namespace,
276+
},
277+
Rules: []rbacv1.PolicyRule{
278+
{
279+
Verbs: []string{"watch", "get", "update", "list"},
280+
APIGroups: []string{""},
281+
Resources: []string{"secrets"},
282+
ResourceNames: secrets,
283+
},
284+
},
285+
}
286+
}
287+
288+
// planRoleBinding creates a RoleBinding for the plan.
289+
func (p *Planner) planRoleBinding() *rbacv1.RoleBinding {
290+
return &rbacv1.RoleBinding{
291+
ObjectMeta: metav1.ObjectMeta{
292+
Name: p.machine.Labels[clusterv1.ClusterNameLabel] + "-" + p.Name,
293+
Namespace: p.machine.Namespace,
294+
},
295+
Subjects: []rbacv1.Subject{
296+
{
297+
Kind: "ServiceAccount",
298+
Name: p.machine.Labels[clusterv1.ClusterNameLabel] + "-system-agent",
299+
Namespace: p.machine.Namespace,
300+
},
301+
},
302+
RoleRef: rbacv1.RoleRef{
303+
APIGroup: rbacv1.GroupName,
304+
Kind: "Role",
305+
Name: p.machine.Labels[clusterv1.ClusterNameLabel] + "-" + p.Name,
306+
},
307+
}
308+
}
309+
310+
func (p *Planner) Permit(ctx context.Context) error {
311+
if err := p.Create(ctx, p.planRole()); client.IgnoreAlreadyExists(err) != nil {
312+
return fmt.Errorf("unable to create plan role: %w", err)
313+
}
314+
315+
if err := p.Create(ctx, p.planRoleBinding()); client.IgnoreAlreadyExists(err) != nil {
316+
return fmt.Errorf("unable to create plan role binding: %w", err)
317+
}
318+
319+
return nil
320+
}
321+
322+
func (p *Planner) Revoke(ctx context.Context) error {
323+
if err := p.Delete(ctx, p.planRole()); client.IgnoreNotFound(err) != nil {
324+
return fmt.Errorf("unable to delete plan role: %w", err)
325+
}
326+
327+
if err := p.Delete(ctx, p.planRoleBinding()); client.IgnoreNotFound(err) != nil {
328+
return fmt.Errorf("unable to delete plan role binding: %w", err)
329+
}
330+
331+
return nil
332+
}
333+
250334
func (p *Planner) updatePlanSecret(ctx context.Context, data []byte) error {
251335
log := log.FromContext(ctx)
252336

exp/etcdrestore/examples/etcd-restore.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
apiversion: turtles-capi.cattle.io/v1alpha1
1+
apiVersion: turtles-capi.cattle.io/v1alpha1
22
kind: ETCDSnapshotRestore
33
metadata:
44
name: example-restore

exp/etcdrestore/go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ require (
1515
k8s.io/client-go v0.29.4
1616
k8s.io/component-base v0.29.4
1717
k8s.io/klog/v2 v2.110.1
18+
k8s.io/utils v0.0.0-20231127182322-b307cd553661
1819
sigs.k8s.io/cluster-api v1.7.3
1920
sigs.k8s.io/cluster-api-operator v0.13.0
2021
sigs.k8s.io/controller-runtime v0.17.3
@@ -86,7 +87,6 @@ require (
8687
k8s.io/apiextensions-apiserver v0.29.4 // indirect
8788
k8s.io/cluster-bootstrap v0.29.3 // indirect
8889
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
89-
k8s.io/utils v0.0.0-20231127182322-b307cd553661 // indirect
9090
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
9191
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
9292
sigs.k8s.io/yaml v1.4.0 // indirect

0 commit comments

Comments
 (0)