Skip to content

Commit e40fe2b

Browse files
author
Benjamin Ash
committed
feat: add support for ASG instance refresh for workers
Signed-off-by: Benjamin Ash <[email protected]>
1 parent 781f673 commit e40fe2b

9 files changed

+389
-4
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml
22
eks-admin-service-account.yaml
33
config-map-aws-auth*.yaml
44
kubeconfig_*
5+
.idea
56

67
#################################################################
78
# Default .gitignore content for all terraform-aws-modules below

README.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ By default, this module manages the `aws-auth` configmap for you (`manage_aws_au
3131

3232
For windows users, please read the following [doc](https://github.com/terraform-aws-modules/terraform-aws-eks/blob/master/docs/faq.md#deploying-from-windows-binsh-file-does-not-exist).
3333

34+
Setting `instance_refresh_enabled` to true will recreate your worker nodes without draining them first. It is therefore recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) so that nodes are drained properly. A complete example can be found in [instance_refresh](examples/instance_refresh).
35+
36+
3437
## Usage example
3538

3639
A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic).
@@ -145,7 +148,7 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
145148
| Name | Version |
146149
|------|---------|
147150
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 0.13.1 |
148-
| <a name="requirement_aws"></a> [aws](#requirement\_aws) | >= 3.35.0 |
151+
| <a name="requirement_aws"></a> [aws](#requirement\_aws) | >= 3.37.0 |
149152
| <a name="requirement_kubernetes"></a> [kubernetes](#requirement\_kubernetes) | >= 1.11.1 |
150153
| <a name="requirement_local"></a> [local](#requirement\_local) | >= 1.4 |
151154
| <a name="requirement_null"></a> [null](#requirement\_null) | >= 2.1 |
@@ -156,7 +159,7 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
156159

157160
| Name | Version |
158161
|------|---------|
159-
| <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.35.0 |
162+
| <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.37.0 |
160163
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 1.11.1 |
161164
| <a name="provider_local"></a> [local](#provider\_local) | >= 1.4 |
162165
| <a name="provider_null"></a> [null](#provider\_null) | >= 2.1 |

examples/instance_refresh/main.tf

+259
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
provider "aws" {
2+
region = var.region
3+
}
4+
data "aws_caller_identity" "current" {}
5+
6+
data "aws_eks_cluster" "cluster" {
7+
name = module.eks.cluster_id
8+
}
9+
10+
data "aws_eks_cluster_auth" "cluster" {
11+
name = module.eks.cluster_id
12+
}
13+
14+
provider "kubernetes" {
15+
host = data.aws_eks_cluster.cluster.endpoint
16+
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
17+
token = data.aws_eks_cluster_auth.cluster.token
18+
load_config_file = false
19+
}
20+
21+
data "aws_availability_zones" "available" {
22+
}
23+
24+
locals {
25+
cluster_name = "test-refresh-${random_string.suffix.result}"
26+
}
27+
28+
resource "random_string" "suffix" {
29+
length = 8
30+
special = false
31+
}
32+
33+
module "vpc" {
34+
source = "terraform-aws-modules/vpc/aws"
35+
version = "~> 3.0.0"
36+
37+
name = local.cluster_name
38+
cidr = "10.0.0.0/16"
39+
azs = data.aws_availability_zones.available.names
40+
public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]
41+
enable_dns_hostnames = true
42+
}
43+
44+
data "aws_iam_policy_document" "node_term" {
45+
statement {
46+
effect = "Allow"
47+
actions = [
48+
"ec2:DescribeInstances",
49+
"autoscaling:DescribeAutoScalingInstances",
50+
"autoscaling:DescribeTags",
51+
]
52+
resources = [
53+
"*",
54+
]
55+
}
56+
statement {
57+
effect = "Allow"
58+
actions = [
59+
"autoscaling:CompleteLifecycleAction",
60+
]
61+
resources = [
62+
module.eks.workers_asg_arns[0],
63+
]
64+
}
65+
statement {
66+
effect = "Allow"
67+
actions = [
68+
"sqs:DeleteMessage",
69+
"sqs:ReceiveMessage"
70+
]
71+
resources = [
72+
module.node_term_sqs.sqs_queue_arn
73+
]
74+
}
75+
}
76+
77+
resource "aws_iam_policy" "node_term" {
78+
name = "node-term-${local.cluster_name}"
79+
policy = data.aws_iam_policy_document.node_term.json
80+
}
81+
82+
resource "aws_iam_role_policy_attachment" "node_term_policy" {
83+
policy_arn = aws_iam_policy.node_term.arn
84+
role = module.eks.worker_iam_role_name
85+
}
86+
87+
data "aws_iam_policy_document" "node_term_events" {
88+
statement {
89+
effect = "Allow"
90+
principals {
91+
type = "Service"
92+
identifiers = [
93+
"events.amazonaws.com",
94+
"sqs.amazonaws.com",
95+
]
96+
}
97+
actions = [
98+
"sqs:SendMessage",
99+
]
100+
resources = [
101+
"arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}",
102+
]
103+
}
104+
}
105+
106+
module "node_term_sqs" {
107+
source = "terraform-aws-modules/sqs/aws"
108+
version = "~> 3.0.0"
109+
name = local.cluster_name
110+
message_retention_seconds = 300
111+
policy = data.aws_iam_policy_document.node_term_events.json
112+
}
113+
114+
resource "aws_cloudwatch_event_rule" "node_term_event_rule" {
115+
name = "${local.cluster_name}-nth-rule"
116+
description = "Node termination event rule"
117+
event_pattern = jsonencode(
118+
{
119+
"source" : [
120+
"aws.autoscaling"
121+
],
122+
"detail-type" : [
123+
"EC2 Instance-terminate Lifecycle Action"
124+
]
125+
"resources" : [
126+
module.eks.workers_asg_arns[0],
127+
]
128+
}
129+
)
130+
}
131+
132+
resource "aws_cloudwatch_event_target" "node_term_event_target" {
133+
rule = aws_cloudwatch_event_rule.node_term_event_rule.name
134+
target_id = "ANTHandler"
135+
arn = module.node_term_sqs.sqs_queue_arn
136+
}
137+
138+
module "node_term_role" {
139+
source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
140+
version = "4.1.0"
141+
create_role = true
142+
role_description = "IRSA role for ANTH, cluster ${local.cluster_name}"
143+
role_name_prefix = local.cluster_name
144+
provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "")
145+
role_policy_arns = [aws_iam_policy.node_term.arn]
146+
oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"]
147+
}
148+
149+
resource "null_resource" "helm_install" {
150+
provisioner "local-exec" {
151+
environment = {
152+
USE_SUDO = "false"
153+
HELM_INSTALL_DIR = "${path.root}/bin"
154+
}
155+
command = <<EOT
156+
mkdir -p ${path.root}/bin
157+
curl -fsSL -o ${path.root}/bin/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
158+
chmod +x ${path.root}/bin/get_helm.sh
159+
${path.root}/bin/get_helm.sh
160+
EOT
161+
}
162+
}
163+
164+
resource "null_resource" "helm_add_repo" {
165+
depends_on = [
166+
null_resource.helm_install,
167+
]
168+
169+
triggers = {
170+
version = var.aws_node_termination_handler_chart_version
171+
}
172+
173+
provisioner "local-exec" {
174+
environment = {
175+
XDG_CACHE_HOME = abspath("${path.root}/.cache")
176+
XDG_CONFIG_HOME = abspath(path.root)
177+
XDG_DATA_HOME = abspath(path.root)
178+
}
179+
command = "${path.root}/bin/helm repo add --force-update eks https://aws.github.io/eks-charts"
180+
}
181+
}
182+
183+
resource "null_resource" "helm_install_chart" {
184+
depends_on = [
185+
module.eks,
186+
module.node_term_role,
187+
null_resource.helm_add_repo,
188+
]
189+
190+
triggers = {
191+
version = var.aws_node_termination_handler_chart_version
192+
role_arn = module.node_term_role.iam_role_arn
193+
}
194+
195+
provisioner "local-exec" {
196+
environment = {
197+
KUBECONFIG = "${path.root}/${module.eks.kubeconfig_filename}"
198+
XDG_CACHE_HOME = abspath("${path.root}/.cache")
199+
XDG_CONFIG_HOME = abspath(path.root)
200+
XDG_DATA_HOME = abspath(path.root)
201+
}
202+
command = <<EOT
203+
chmod 0600 "$KUBECONFIG"
204+
${path.root}/bin/helm upgrade --install aws-node-termination-handler \
205+
--wait \
206+
--namespace ${var.namespace} \
207+
--set serviceAccount.name=${var.serviceaccount} \
208+
--set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${self.triggers.role_arn} \
209+
--set enableSqsTerminationDraining=true \
210+
--set queueURL=${module.node_term_sqs.sqs_queue_id} \
211+
--set logLevel=DEBUG \
212+
--set awsRegion=${var.region} \
213+
--version=${self.triggers.version} \
214+
eks/aws-node-termination-handler
215+
EOT
216+
}
217+
}
218+
219+
resource "aws_autoscaling_lifecycle_hook" "node_term" {
220+
name = "node_term-${local.cluster_name}"
221+
autoscaling_group_name = module.eks.workers_asg_names[0]
222+
lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
223+
heartbeat_timeout = 300
224+
default_result = "CONTINUE"
225+
}
226+
227+
module "eks" {
228+
source = "../.."
229+
cluster_name = local.cluster_name
230+
cluster_version = "1.19"
231+
subnets = module.vpc.public_subnets
232+
vpc_id = module.vpc.vpc_id
233+
enable_irsa = true
234+
worker_create_initial_lifecycle_hooks = true
235+
worker_groups_launch_template = [
236+
{
237+
name = "refresh"
238+
asg_max_size = 2
239+
asg_desired_capacity = 2
240+
instance_refresh_enabled = true
241+
instance_refresh_triggers = ["tag"]
242+
public_ip = true
243+
metadata_http_put_response_hop_limit = 3
244+
tags = [
245+
{
246+
key = "aws-node-termination-handler/managed"
247+
value = ""
248+
propagate_at_launch = true
249+
},
250+
{
251+
key = "foo"
252+
value = "buzz"
253+
propagate_at_launch = true
254+
},
255+
]
256+
},
257+
]
258+
}
259+

examples/instance_refresh/outputs.tf

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
output "cluster_endpoint" {
2+
description = "Endpoint for EKS control plane."
3+
value = module.eks.cluster_endpoint
4+
}
5+
6+
output "cluster_security_group_id" {
7+
description = "Security group ids attached to the cluster control plane."
8+
value = module.eks.cluster_security_group_id
9+
}
10+
11+
output "kubectl_config" {
12+
description = "kubectl config as generated by the module."
13+
value = module.eks.kubeconfig
14+
}
15+
16+
output "config_map_aws_auth" {
17+
description = "A kubernetes configuration to authenticate to this EKS cluster."
18+
value = module.eks.config_map_aws_auth
19+
}
20+
21+
output "region" {
22+
description = "AWS region."
23+
value = var.region
24+
}
25+
26+
output "sqs_queue_asg_notification_arn" {
27+
description = "SQS queue ASG notification ARN"
28+
value = module.node_term_sqs.sqs_queue_arn
29+
}
30+
31+
output "sqs_queue_asg_notification_url" {
32+
description = "SQS queue ASG notification URL"
33+
value = module.node_term_sqs.sqs_queue_id
34+
}
+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
variable "region" {
2+
default = "us-west-2"
3+
}
4+
5+
variable "aws_node_termination_handler_chart_version" {
6+
description = "Version of the aws-node-termination-handler Helm chart to install."
7+
default = "0.15.0"
8+
}
9+
10+
variable "namespace" {
11+
description = "Namespace for the aws-node-termination-handler."
12+
default = "kube-system"
13+
}
14+
15+
variable "serviceaccount" {
16+
description = "Serviceaccount for the aws-node-termination-handler."
17+
default = "aws-node-termination-handler"
18+
}

examples/instance_refresh/versions.tf

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
terraform {
  required_version = ">= 0.13.1"

  required_providers {
    # ASG instance refresh requires AWS provider >= 3.37.0; this matches the
    # constraint of the root module this example consumes (was >= 3.22.0,
    # which could select a provider too old for instance_refresh).
    aws        = ">= 3.37.0"
    local      = ">= 1.4"
    null       = ">= 2.1"
    template   = ">= 2.1"
    random     = ">= 2.1"
    kubernetes = "~> 1.11"
  }
}

local.tf

+6-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ locals {
3434
asg_max_size = "3" # Maximum worker capacity in the autoscaling group.
3535
asg_min_size                         = "1"                    # Minimum worker capacity in the autoscaling group. NOTE: Changing this parameter affects asg_desired_capacity; e.g. raising it to 2 raises asg_desired_capacity to 2, but lowering it back to 1 does not lower asg_desired_capacity.
3636
asg_force_delete = false # Enable forced deletion for the autoscaling group.
37-
asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group.
37+
asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group.
3838
asg_recreate_on_change = false # Recreate the autoscaling group when the Launch Template or Launch Configuration change.
3939
default_cooldown = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start.
4040
health_check_type = null # Controls how health checking is done. Valid values are "EC2" or "ELB".
@@ -96,6 +96,11 @@ locals {
9696
spot_max_price = "" # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price
9797
max_instance_lifetime = 0 # Maximum number of seconds instances can run in the ASG. 0 is unlimited.
9898
elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc.
99+
instance_refresh_enabled = false # Enable instance refresh for the worker autoscaling group.
100+
instance_refresh_strategy = "Rolling" # Strategy to use for instance refresh. Default is 'Rolling', which is the only valid value.
101+
instance_refresh_min_healthy_percentage = 90 # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity.
102+
instance_refresh_instance_warmup = null # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period.
103+
instance_refresh_triggers = [] # Set of additional property names that will trigger an Instance Refresh. A refresh will always be triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy.
99104
}
100105

101106
workers_group_defaults = merge(

0 commit comments

Comments
 (0)