Skip to content

Commit 484b484

Browse files
bashimsbarryib
authored and committed
feat: Add support for Auto Scaling Group Instance Refresh for self-managed worker groups (terraform-aws-modules#1224)
Co-authored-by: Thierno IB. BARRY <[email protected]>
1 parent eea966d commit 484b484

9 files changed

+360
-5
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml
22
eks-admin-service-account.yaml
33
config-map-aws-auth*.yaml
44
kubeconfig_*
5+
.idea
56

67
#################################################################
78
# Default .gitignore content for all terraform-aws-modules below

README.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ An example of harming update was the removal of several commonly used, but depre
2727

2828
By default, this module manages the `aws-auth` configmap for you (`manage_aws_auth=true`). To avoid the following [issue](https://github.com/aws/containers-roadmap/issues/654), where the EKS cluster reports `ACTIVE` but is not yet ready, we implemented a "retry" logic with a fork of the http provider https://github.com/terraform-aws-modules/terraform-provider-http. This fork adds support for a self-signed CA certificate. The original PR can be found at https://github.com/hashicorp/terraform-provider-http/pull/29.
2929

30+
Setting `instance_refresh_enabled` to true will recreate your worker nodes without draining them first. It is recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) for proper node draining. A complete example can be found in [instance_refresh](examples/instance_refresh).
31+
3032
## Usage example
3133

3234
A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic).
@@ -155,7 +157,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
155157
| <a name="provider_http"></a> [http](#provider\_http) | >= 2.3.0 |
156158
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 1.11.1 |
157159
| <a name="provider_local"></a> [local](#provider\_local) | >= 1.4 |
158-
| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |
159160

160161
## Modules
161162

@@ -202,8 +203,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
202203
| [aws_security_group_rule.workers_ingress_self](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
203204
| [kubernetes_config_map.aws_auth](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
204205
| [local_file.kubeconfig](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource |
205-
| [random_pet.workers](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
206-
| [random_pet.workers_launch_template](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
207206
| [aws_ami.eks_worker](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
208207
| [aws_ami.eks_worker_windows](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
209208
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |

examples/instance_refresh/main.tf

+234
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
provider "aws" {
2+
region = var.region
3+
}
4+
5+
data "aws_caller_identity" "current" {}
6+
7+
data "aws_eks_cluster" "cluster" {
8+
name = module.eks.cluster_id
9+
}
10+
11+
data "aws_eks_cluster_auth" "cluster" {
12+
name = module.eks.cluster_id
13+
}
14+
15+
provider "kubernetes" {
16+
host = data.aws_eks_cluster.cluster.endpoint
17+
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
18+
token = data.aws_eks_cluster_auth.cluster.token
19+
load_config_file = false
20+
}
21+
22+
provider "helm" {
23+
kubernetes {
24+
host = data.aws_eks_cluster.cluster.endpoint
25+
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
26+
token = data.aws_eks_cluster_auth.cluster.token
27+
}
28+
}
29+
30+
data "aws_availability_zones" "available" {
31+
}
32+
33+
locals {
34+
cluster_name = "test-refresh-${random_string.suffix.result}"
35+
}
36+
37+
resource "random_string" "suffix" {
38+
length = 8
39+
special = false
40+
}
41+
42+
module "vpc" {
43+
source = "terraform-aws-modules/vpc/aws"
44+
version = "~> 3.0.0"
45+
46+
name = local.cluster_name
47+
cidr = "10.0.0.0/16"
48+
azs = data.aws_availability_zones.available.names
49+
public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]
50+
enable_dns_hostnames = true
51+
}
52+
53+
data "aws_iam_policy_document" "node_term" {
54+
statement {
55+
effect = "Allow"
56+
actions = [
57+
"ec2:DescribeInstances",
58+
"autoscaling:DescribeAutoScalingInstances",
59+
"autoscaling:DescribeTags",
60+
]
61+
resources = [
62+
"*",
63+
]
64+
}
65+
statement {
66+
effect = "Allow"
67+
actions = [
68+
"autoscaling:CompleteLifecycleAction",
69+
]
70+
resources = module.eks.workers_asg_arns
71+
}
72+
statement {
73+
effect = "Allow"
74+
actions = [
75+
"sqs:DeleteMessage",
76+
"sqs:ReceiveMessage"
77+
]
78+
resources = [
79+
module.node_term_sqs.sqs_queue_arn
80+
]
81+
}
82+
}
83+
84+
resource "aws_iam_policy" "node_term" {
85+
name = "node-term-${local.cluster_name}"
86+
policy = data.aws_iam_policy_document.node_term.json
87+
}
88+
89+
resource "aws_iam_role_policy_attachment" "node_term_policy" {
90+
policy_arn = aws_iam_policy.node_term.arn
91+
role = module.eks.worker_iam_role_name
92+
}
93+
94+
data "aws_iam_policy_document" "node_term_events" {
95+
statement {
96+
effect = "Allow"
97+
principals {
98+
type = "Service"
99+
identifiers = [
100+
"events.amazonaws.com",
101+
"sqs.amazonaws.com",
102+
]
103+
}
104+
actions = [
105+
"sqs:SendMessage",
106+
]
107+
resources = [
108+
"arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}",
109+
]
110+
}
111+
}
112+
113+
module "node_term_sqs" {
114+
source = "terraform-aws-modules/sqs/aws"
115+
version = "~> 3.0.0"
116+
name = local.cluster_name
117+
message_retention_seconds = 300
118+
policy = data.aws_iam_policy_document.node_term_events.json
119+
}
120+
121+
resource "aws_cloudwatch_event_rule" "node_term_event_rule" {
122+
name = "${local.cluster_name}-nth-rule"
123+
description = "Node termination event rule"
124+
event_pattern = jsonencode(
125+
{
126+
"source" : [
127+
"aws.autoscaling"
128+
],
129+
"detail-type" : [
130+
"EC2 Instance-terminate Lifecycle Action"
131+
]
132+
"resources" : module.eks.workers_asg_arns
133+
}
134+
)
135+
}
136+
137+
resource "aws_cloudwatch_event_target" "node_term_event_target" {
138+
rule = aws_cloudwatch_event_rule.node_term_event_rule.name
139+
target_id = "ANTHandler"
140+
arn = module.node_term_sqs.sqs_queue_arn
141+
}
142+
143+
module "node_term_role" {
144+
source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
145+
version = "4.1.0"
146+
create_role = true
147+
role_description = "IRSA role for ANTH, cluster ${local.cluster_name}"
148+
role_name_prefix = local.cluster_name
149+
provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "")
150+
role_policy_arns = [aws_iam_policy.node_term.arn]
151+
oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"]
152+
}
153+
154+
resource "helm_release" "anth" {
155+
depends_on = [
156+
module.eks
157+
]
158+
159+
name = "aws-node-termination-handler"
160+
namespace = var.namespace
161+
repository = "https://aws.github.io/eks-charts"
162+
chart = "aws-node-termination-handler"
163+
version = var.aws_node_termination_handler_chart_version
164+
create_namespace = true
165+
166+
set {
167+
name = "awsRegion"
168+
value = var.region
169+
}
170+
set {
171+
name = "serviceAccount.name"
172+
value = var.serviceaccount
173+
}
174+
set {
175+
name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
176+
value = module.node_term_role.iam_role_arn
177+
type = "string"
178+
}
179+
set {
180+
name = "enableSqsTerminationDraining"
181+
value = "true"
182+
}
183+
set {
184+
name = "queueURL"
185+
value = module.node_term_sqs.sqs_queue_id
186+
}
187+
set {
188+
name = "logLevel"
189+
value = "DEBUG"
190+
}
191+
}
192+
193+
# Creating the lifecycle-hook outside of the ASG resource's `initial_lifecycle_hook`
194+
# ensures that node termination does not require the lifecycle action to be completed,
195+
# and thus allows the ASG to be destroyed cleanly.
196+
resource "aws_autoscaling_lifecycle_hook" "node_term" {
197+
name = "node_term-${local.cluster_name}"
198+
autoscaling_group_name = module.eks.workers_asg_names[0]
199+
lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
200+
heartbeat_timeout = 300
201+
default_result = "CONTINUE"
202+
}
203+
204+
module "eks" {
205+
source = "../.."
206+
cluster_name = local.cluster_name
207+
cluster_version = "1.19"
208+
subnets = module.vpc.public_subnets
209+
vpc_id = module.vpc.vpc_id
210+
enable_irsa = true
211+
worker_groups_launch_template = [
212+
{
213+
name = "refresh"
214+
asg_max_size = 2
215+
asg_desired_capacity = 2
216+
instance_refresh_enabled = true
217+
instance_refresh_triggers = ["tag"]
218+
public_ip = true
219+
metadata_http_put_response_hop_limit = 3
220+
tags = [
221+
{
222+
key = "aws-node-termination-handler/managed"
223+
value = ""
224+
propagate_at_launch = true
225+
},
226+
{
227+
key = "foo"
228+
value = "buzz"
229+
propagate_at_launch = true
230+
},
231+
]
232+
},
233+
]
234+
}

examples/instance_refresh/outputs.tf

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
output "cluster_endpoint" {
2+
description = "Endpoint for EKS control plane."
3+
value = module.eks.cluster_endpoint
4+
}
5+
6+
output "cluster_security_group_id" {
7+
description = "Security group ids attached to the cluster control plane."
8+
value = module.eks.cluster_security_group_id
9+
}
10+
11+
output "kubectl_config" {
12+
description = "kubectl config as generated by the module."
13+
value = module.eks.kubeconfig
14+
}
15+
16+
output "config_map_aws_auth" {
17+
description = "A kubernetes configuration to authenticate to this EKS cluster."
18+
value = module.eks.config_map_aws_auth
19+
}
20+
21+
output "region" {
22+
description = "AWS region."
23+
value = var.region
24+
}
25+
26+
output "sqs_queue_asg_notification_arn" {
27+
description = "SQS queue ASG notification ARN"
28+
value = module.node_term_sqs.sqs_queue_arn
29+
}
30+
31+
output "sqs_queue_asg_notification_url" {
32+
description = "SQS queue ASG notification URL"
33+
value = module.node_term_sqs.sqs_queue_id
34+
}
+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
variable "region" {
2+
default = "us-west-2"
3+
}
4+
5+
variable "aws_node_termination_handler_chart_version" {
6+
description = "Version of the aws-node-termination-handler Helm chart to install."
7+
default = "0.15.0"
8+
}
9+
10+
variable "namespace" {
11+
description = "Namespace for the aws-node-termination-handler."
12+
default = "kube-system"
13+
}
14+
15+
variable "serviceaccount" {
16+
description = "Serviceaccount for the aws-node-termination-handler."
17+
default = "aws-node-termination-handler"
18+
}

examples/instance_refresh/versions.tf

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
terraform {
2+
required_version = ">= 0.13.1"
3+
4+
required_providers {
5+
aws = ">= 3.22.0"
6+
local = ">= 1.4"
7+
random = ">= 2.1"
8+
kubernetes = "~> 1.11"
9+
helm = "~> 2.1.2"
10+
}
11+
}

local.tf

+6-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ locals {
3434
asg_max_size = "3" # Maximum worker capacity in the autoscaling group.
3535
asg_min_size                            = "1"                         # Minimum worker capacity in the autoscaling group. NOTE: A change in this parameter will affect asg_desired_capacity; e.g. changing its value to 2 will change asg_desired_capacity to 2, but bringing it back to 1 will not affect asg_desired_capacity.
3636
asg_force_delete = false # Enable forced deletion for the autoscaling group.
37-
asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group.
37+
asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group.
3838
default_cooldown = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start.
3939
health_check_type = null # Controls how health checking is done. Valid values are "EC2" or "ELB".
4040
health_check_grace_period = null # Time in seconds after instance comes into service before checking health.
@@ -95,6 +95,11 @@ locals {
9595
spot_max_price = "" # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price
9696
max_instance_lifetime = 0 # Maximum number of seconds instances can run in the ASG. 0 is unlimited.
9797
elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc.
98+
instance_refresh_enabled = false # Enable instance refresh for the worker autoscaling group.
99+
instance_refresh_strategy               = "Rolling"                   # Strategy to use for instance refresh. Default is 'Rolling', which is the only valid value.
100+
instance_refresh_min_healthy_percentage = 90 # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity.
101+
instance_refresh_instance_warmup = null # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period.
102+
instance_refresh_triggers = [] # Set of additional property names that will trigger an Instance Refresh. A refresh will always be triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy.
98103
}
99104

100105
workers_group_defaults = merge(

workers.tf

+27
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,33 @@ resource "aws_autoscaling_group" "workers" {
162162
}
163163
}
164164

165+
# logic duplicated in workers_launch_template.tf
166+
dynamic "instance_refresh" {
167+
for_each = lookup(var.worker_groups[count.index],
168+
"instance_refresh_enabled",
169+
local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
170+
content {
171+
strategy = lookup(
172+
var.worker_groups[count.index], "instance_refresh_strategy",
173+
local.workers_group_defaults["instance_refresh_strategy"]
174+
)
175+
preferences {
176+
instance_warmup = lookup(
177+
var.worker_groups[count.index], "instance_refresh_instance_warmup",
178+
local.workers_group_defaults["instance_refresh_instance_warmup"]
179+
)
180+
min_healthy_percentage = lookup(
181+
var.worker_groups[count.index], "instance_refresh_min_healthy_percentage",
182+
local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
183+
)
184+
}
185+
triggers = lookup(
186+
var.worker_groups[count.index], "instance_refresh_triggers",
187+
local.workers_group_defaults["instance_refresh_triggers"]
188+
)
189+
}
190+
}
191+
165192
lifecycle {
166193
create_before_destroy = true
167194
ignore_changes = [desired_capacity]

0 commit comments

Comments
 (0)