Skip to content

Commit 06e7f3a

Browse files
committed
Workaround for de-duplication of cloud task
1 parent 42dc9a1 commit 06e7f3a

File tree

3 files changed

+40
-25
lines changed

3 files changed

+40
-25
lines changed

iam.tf

+6-6
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@ resource "google_project_iam_custom_role" "manage_vm_instances" {
3030
permissions = ["compute.instances.get", "compute.instances.start", "compute.instances.stop", "compute.instances.delete", "compute.instances.create", "compute.instances.setMetadata", "compute.instances.setTags", "compute.instances.setServiceAccount"]
3131
}
3232

33-
resource "google_project_iam_custom_role" "create_cloud_task" {
34-
role_id = "CreateCloudTask"
35-
title = "Create a Cloud Task"
36-
permissions = ["cloudtasks.tasks.create"]
33+
resource "google_project_iam_custom_role" "create_delete_cloud_task" {
34+
role_id = "CreateDeleteCloudTask"
35+
title = "Create/Delete a Cloud Task"
36+
permissions = ["cloudtasks.tasks.create", "cloudtasks.tasks.delete"]
3737
}
3838

3939
resource "google_project_iam_custom_role" "create_vm_from_instance_template" {
@@ -71,10 +71,10 @@ resource "google_project_iam_member" "manage_vm_instances_member" {
7171
}
7272
}
7373

74-
resource "google_project_iam_member" "create_cloud_task_member" {
74+
resource "google_project_iam_member" "create_delete_cloud_task_member" {
7575
project = local.projectId
7676
member = "serviceAccount:${google_service_account.autoscaler_sa.email}"
77-
role = google_project_iam_custom_role.create_cloud_task.id
77+
role = google_project_iam_custom_role.create_delete_cloud_task.id
7878

7979
# DOES NOT WORK
8080
#condition {

runner-autoscaler/pkg/srv.go

+20-9
Original file line numberDiff line numberDiff line change
@@ -499,15 +499,15 @@ func (s *Autoscaler) GenerateRunnerJitConfig(ctx context.Context, url string, ru
499499
}
500500
}
501501

502-
func (s *Autoscaler) createCallbackTaskWithToken(ctx context.Context, url string, secret string, job Job, delay time.Duration) error {
502+
func (s *Autoscaler) CreateCallbackTaskWithToken(ctx context.Context, url string, secret string, job Job, delay time.Duration) error {
503503

504504
data, _ := json.Marshal(job)
505505
now := timestamppb.Now()
506506
now.Seconds += int64(delay.Seconds())
507507
req := &taskspb.CreateTaskRequest{
508508
Parent: s.conf.TaskQueue,
509509
Task: &taskspb.Task{
510-
Name: fmt.Sprintf("%s/tasks/%d", s.conf.TaskQueue, job.Id),
510+
Name: fmt.Sprintf("%s/tasks/%d-0", s.conf.TaskQueue, job.Id),
511511
DispatchDeadline: &durationpb.Duration{
512512
Seconds: 120, // the timeout of the cloud task callback - must be greater the time it takes to start the VM
513513
Nanos: 0,
@@ -531,19 +531,30 @@ func (s *Autoscaler) createCallbackTaskWithToken(ctx context.Context, url string
531531
defer client.Close()
532532
_, err := client.CreateTask(ctx, req)
533533
if err != nil {
534-
return fmt.Errorf("cloudtasks.CreateTask failed for job Id %d: %v", job.Id, err)
534+
// parse error so we can workaround de-duplication
535+
if match, _ := regexp.MatchString("code = AlreadyExists", err.Error()); match {
536+
req.Task.Name = fmt.Sprintf("%s/tasks/%d-1", s.conf.TaskQueue, job.Id)
537+
_, err := client.CreateTask(ctx, req)
538+
if err != nil {
539+
return fmt.Errorf("cloudtasks.CreateTask finally failed for job Id %d: %v", job.Id, err)
540+
} else {
541+
log.Infof("Finally created cloud task callback for workflow job Id %d with url \"%s\" and payload \"%s\"", job.Id, url, data)
542+
}
543+
} else {
544+
return fmt.Errorf("cloudtasks.CreateTask failed for job Id %d: %v", job.Id, err)
545+
}
535546
} else {
536547
log.Infof("Created cloud task callback for workflow job Id %d with url \"%s\" and payload \"%s\"", job.Id, url, data)
537548
}
538549
return nil
539550
}
540551

541-
func (s *Autoscaler) deleteCallbackTask(ctx context.Context, job Job) error {
552+
func (s *Autoscaler) DeleteCallbackTask(ctx context.Context, job Job) error {
542553

543554
client := newTaskClient(ctx)
544555
defer client.Close()
545556
err := client.DeleteTask(ctx, &taskspb.DeleteTaskRequest{
546-
Name: fmt.Sprintf("%s/tasks/%d", s.conf.TaskQueue, job.Id),
557+
Name: fmt.Sprintf("%s/tasks/%d-0", s.conf.TaskQueue, job.Id),
547558
})
548559
if err != nil {
549560
return fmt.Errorf("cloudtasks.DeleteTask failed for job Id %d: %v", job.Id, err)
@@ -670,7 +681,7 @@ func (s *Autoscaler) handleWebhook(ctx *gin.Context) {
670681
if ok, missingLabels := payload.Job.HasAllLabels(s.conf.RunnerLabels); ok {
671682
createUrl := createCallbackUrl(ctx, s.conf.RouteCreateVm, s.conf.SourceQueryParam, src.Name)
672683
// delay the create vm callback so we have a chance to delete it if the workflow job is changing its state to 'waiting'
673-
if err := s.createCallbackTaskWithToken(ctx, createUrl, src.Secret, payload.Job, time.Duration(s.conf.CreateVmDelay)*time.Second); err != nil {
684+
if err := s.CreateCallbackTaskWithToken(ctx, createUrl, src.Secret, payload.Job, time.Duration(s.conf.CreateVmDelay)*time.Second); err != nil {
674685
log.Errorf("Can not enqueue create-vm cloud task callback: %s", err.Error())
675686
ctx.AbortWithError(http.StatusInternalServerError, err)
676687
return
@@ -681,7 +692,7 @@ func (s *Autoscaler) handleWebhook(ctx *gin.Context) {
681692
} else if payload.Action == WAITING {
682693
// the waiting action happens if a deployment environment is configured in the workflow that requires a review. We have to cancel the cloud task callback
683694
if ok, missingLabels := payload.Job.HasAllLabels(s.conf.RunnerLabels); ok {
684-
if err := s.deleteCallbackTask(ctx, payload.Job); err != nil {
695+
if err := s.DeleteCallbackTask(ctx, payload.Job); err != nil {
685696
// best effort - this is not considered an error
686697
log.Warnf("Can not delete create-vm cloud task callback: %s", err.Error())
687698
}
@@ -697,10 +708,10 @@ func (s *Autoscaler) handleWebhook(ctx *gin.Context) {
697708
if ok, missingLabels := payload.Job.HasAllLabels(s.conf.RunnerLabels); ok {
698709

699710
// if the user immediately cancels a workflow we have the chance to delete the callback if not older than 10 seconds - best effort, ignore all errors
700-
s.deleteCallbackTask(ctx, payload.Job)
711+
s.DeleteCallbackTask(ctx, payload.Job)
701712

702713
deleteUrl := createCallbackUrl(ctx, s.conf.RouteDeleteVm, s.conf.SourceQueryParam, src.Name)
703-
if err := s.createCallbackTaskWithToken(ctx, deleteUrl, src.Secret, payload.Job, 1*time.Second); err != nil {
714+
if err := s.CreateCallbackTaskWithToken(ctx, deleteUrl, src.Secret, payload.Job, 1*time.Second); err != nil {
704715
log.Errorf("Can not enqueue delete-vm cloud task callback: %s", err.Error())
705716
ctx.AbortWithError(http.StatusInternalServerError, err)
706717
return

runner-autoscaler/test/main_test.go

+14-10
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package test
33
import (
44
"context"
55
"fmt"
6+
"math"
7+
"math/rand"
68
"net/http"
79
"net/url"
810
"strings"
@@ -64,16 +66,6 @@ func TestWebhookSignature(t *testing.T) {
6466
assert.Equal(t, 200, resp.StatusCode)
6567
}
6668

67-
/*
68-
func TestGenerateRunnerRegistrationToken(t *testing.T) {
69-
70-
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
71-
defer cancel()
72-
token, err := scaler.GenerateRunnerRegistrationToken(ctx)
73-
assert.Nil(t, err)
74-
assert.NotEmpty(t, token)
75-
}*/
76-
7769
func TestGenerateRunnerJitConfig(t *testing.T) {
7870

7971
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -93,6 +85,18 @@ func TestGetMagicLabelValue(t *testing.T) {
9385
assert.Equal(t, "test", *result)
9486
}
9587

88+
func TestCreateCallbackTask(t *testing.T) {
89+
90+
job := pkg.Job{
91+
Id: rand.Int63n(math.MaxInt64),
92+
Labels: []string{"test", "@foo:bar", "@machine:test"},
93+
}
94+
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
95+
defer cancel()
96+
err := scaler.DeleteCallbackTask(ctx, job)
97+
assert.Nil(t, err)
98+
}
99+
96100
func TestHasAllLabels(t *testing.T) {
97101

98102
job := pkg.Job{

0 commit comments

Comments
 (0)