Skip to content

Commit 05835f8

Browse files
author
Benjamin Ash
committed
feat: Add support for ASG instance refresh for workers
Signed-off-by: Benjamin Ash <[email protected]>
1 parent 32f70af commit 05835f8

9 files changed

+359
-5
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml
22
eks-admin-service-account.yaml
33
config-map-aws-auth*.yaml
44
kubeconfig_*
5+
.idea
56

67
#################################################################
78
# Default .gitignore content for all terraform-aws-modules below

README.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ An example of harming update was the removal of several commonly used, but depre
2727

2828
By default, this module manages the `aws-auth` configmap for you (`manage_aws_auth=true`). To avoid the following [issue](https://github.com/aws/containers-roadmap/issues/654), where the EKS cluster is reported as `ACTIVE` but is not yet ready, we implemented "retry" logic with a fork of the http provider: https://github.com/terraform-aws-modules/terraform-provider-http. This fork adds support for a self-signed CA certificate. The original PR can be found at https://github.com/hashicorp/terraform-provider-http/pull/29.
2929

30+
Setting `instance_refresh_enabled` to `true` will recreate your worker nodes without draining them first. It is recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) for proper node draining. A complete example can be found in [instance_refresh](examples/instance_refresh).
31+
3032
## Usage example
3133

3234
A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic).
@@ -155,7 +157,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
155157
| <a name="provider_http"></a> [http](#provider\_http) | >= 2.3.0 |
156158
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 1.11.1 |
157159
| <a name="provider_local"></a> [local](#provider\_local) | >= 1.4 |
158-
| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |
159160

160161
## Modules
161162

@@ -202,8 +203,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
202203
| [aws_security_group_rule.workers_ingress_self](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
203204
| [kubernetes_config_map.aws_auth](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
204205
| [local_file.kubeconfig](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource |
205-
| [random_pet.workers](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
206-
| [random_pet.workers_launch_template](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
207206
| [aws_ami.eks_worker](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
208207
| [aws_ami.eks_worker_windows](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
209208
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |

examples/instance_refresh/main.tf

+233
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
provider "aws" {
2+
region = var.region
3+
}
4+
5+
data "aws_caller_identity" "current" {}
6+
7+
data "aws_eks_cluster" "cluster" {
8+
name = module.eks.cluster_id
9+
}
10+
11+
data "aws_eks_cluster_auth" "cluster" {
12+
name = module.eks.cluster_id
13+
}
14+
15+
provider "kubernetes" {
16+
host = data.aws_eks_cluster.cluster.endpoint
17+
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
18+
token = data.aws_eks_cluster_auth.cluster.token
19+
load_config_file = false
20+
}
21+
22+
provider "helm" {
23+
kubernetes {
24+
config_path = module.eks.kubeconfig_filename
25+
}
26+
}
27+
28+
data "aws_availability_zones" "available" {
29+
}
30+
31+
locals {
32+
cluster_name = "test-refresh-${random_string.suffix.result}"
33+
}
34+
35+
resource "random_string" "suffix" {
36+
length = 8
37+
special = false
38+
}
39+
40+
module "vpc" {
41+
source = "terraform-aws-modules/vpc/aws"
42+
version = "~> 3.0.0"
43+
44+
name = local.cluster_name
45+
cidr = "10.0.0.0/16"
46+
azs = data.aws_availability_zones.available.names
47+
public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]
48+
enable_dns_hostnames = true
49+
}
50+
51+
data "aws_iam_policy_document" "node_term" {
52+
statement {
53+
effect = "Allow"
54+
actions = [
55+
"ec2:DescribeInstances",
56+
"autoscaling:DescribeAutoScalingInstances",
57+
"autoscaling:DescribeTags",
58+
]
59+
resources = [
60+
"*",
61+
]
62+
}
63+
statement {
64+
effect = "Allow"
65+
actions = [
66+
"autoscaling:CompleteLifecycleAction",
67+
]
68+
resources = module.eks.workers_asg_arns
69+
}
70+
statement {
71+
effect = "Allow"
72+
actions = [
73+
"sqs:DeleteMessage",
74+
"sqs:ReceiveMessage"
75+
]
76+
resources = [
77+
module.node_term_sqs.sqs_queue_arn
78+
]
79+
}
80+
}
81+
82+
resource "aws_iam_policy" "node_term" {
83+
name = "node-term-${local.cluster_name}"
84+
policy = data.aws_iam_policy_document.node_term.json
85+
}
86+
87+
resource "aws_iam_role_policy_attachment" "node_term_policy" {
88+
policy_arn = aws_iam_policy.node_term.arn
89+
role = module.eks.worker_iam_role_name
90+
}
91+
92+
data "aws_iam_policy_document" "node_term_events" {
93+
statement {
94+
effect = "Allow"
95+
principals {
96+
type = "Service"
97+
identifiers = [
98+
"events.amazonaws.com",
99+
"sqs.amazonaws.com",
100+
]
101+
}
102+
actions = [
103+
"sqs:SendMessage",
104+
]
105+
resources = [
106+
"arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}",
107+
]
108+
}
109+
}
110+
111+
module "node_term_sqs" {
112+
source = "terraform-aws-modules/sqs/aws"
113+
version = "~> 3.0.0"
114+
name = local.cluster_name
115+
message_retention_seconds = 300
116+
policy = data.aws_iam_policy_document.node_term_events.json
117+
}
118+
119+
resource "aws_cloudwatch_event_rule" "node_term_event_rule" {
120+
name = "${local.cluster_name}-nth-rule"
121+
description = "Node termination event rule"
122+
event_pattern = jsonencode(
123+
{
124+
"source" : [
125+
"aws.autoscaling"
126+
],
127+
"detail-type" : [
128+
"EC2 Instance-terminate Lifecycle Action"
129+
]
130+
"resources" : module.eks.workers_asg_arns
131+
}
132+
)
133+
}
134+
135+
resource "aws_cloudwatch_event_target" "node_term_event_target" {
136+
rule = aws_cloudwatch_event_rule.node_term_event_rule.name
137+
target_id = "ANTHandler"
138+
arn = module.node_term_sqs.sqs_queue_arn
139+
}
140+
141+
module "node_term_role" {
142+
source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
143+
version = "4.1.0"
144+
create_role = true
145+
role_description = "IRSA role for ANTH, cluster ${local.cluster_name}"
146+
role_name_prefix = local.cluster_name
147+
provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "")
148+
role_policy_arns = [aws_iam_policy.node_term.arn]
149+
oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"]
150+
}
151+
152+
resource "helm_release" "anth" {
153+
depends_on = [
154+
module.eks
155+
]
156+
157+
name = "aws-node-termination-handler"
158+
namespace = var.namespace
159+
repository = "https://aws.github.io/eks-charts"
160+
chart = "aws-node-termination-handler"
161+
version = var.aws_node_termination_handler_chart_version
162+
create_namespace = true
163+
164+
set {
165+
name = "awsRegion"
166+
value = var.region
167+
}
168+
set {
169+
name = "serviceAccount.name"
170+
value = var.serviceaccount
171+
}
172+
set {
173+
name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
174+
value = module.node_term_role.iam_role_arn
175+
type = "string"
176+
}
177+
set {
178+
name = "enableSqsTerminationDraining"
179+
value = "true"
180+
}
181+
set {
182+
name = "queueURL"
183+
value = module.node_term_sqs.sqs_queue_id
184+
}
185+
set {
186+
name = "logLevel"
187+
value = "DEBUG"
188+
}
189+
}
190+
191+
// Creating the lifecycle-hook outside of the ASG resource's
192+
// `initial_lifecycle_hook` ensures that node termination does not require the
193+
// lifecycle action to be completed, and thus allows the ASG to be destroyed cleanly.
194+
resource "aws_autoscaling_lifecycle_hook" "node_term" {
195+
name = "node_term-${local.cluster_name}"
196+
autoscaling_group_name = module.eks.workers_asg_names[0]
197+
lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
198+
heartbeat_timeout = 300
199+
default_result = "CONTINUE"
200+
}
201+
202+
module "eks" {
203+
source = "../.."
204+
cluster_name = local.cluster_name
205+
cluster_version = "1.19"
206+
subnets = module.vpc.public_subnets
207+
vpc_id = module.vpc.vpc_id
208+
enable_irsa = true
209+
worker_groups_launch_template = [
210+
{
211+
name = "refresh"
212+
asg_max_size = 2
213+
asg_desired_capacity = 2
214+
instance_refresh_enabled = true
215+
instance_refresh_triggers = ["tag"]
216+
public_ip = true
217+
metadata_http_put_response_hop_limit = 3
218+
tags = [
219+
{
220+
key = "aws-node-termination-handler/managed"
221+
value = ""
222+
propagate_at_launch = true
223+
},
224+
{
225+
key = "foo"
226+
value = "buzz"
227+
propagate_at_launch = true
228+
},
229+
]
230+
},
231+
]
232+
}
233+

examples/instance_refresh/outputs.tf

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
output "cluster_endpoint" {
2+
description = "Endpoint for EKS control plane."
3+
value = module.eks.cluster_endpoint
4+
}
5+
6+
output "cluster_security_group_id" {
7+
description = "Security group ids attached to the cluster control plane."
8+
value = module.eks.cluster_security_group_id
9+
}
10+
11+
output "kubectl_config" {
12+
description = "kubectl config as generated by the module."
13+
value = module.eks.kubeconfig
14+
}
15+
16+
output "config_map_aws_auth" {
17+
description = "A kubernetes configuration to authenticate to this EKS cluster."
18+
value = module.eks.config_map_aws_auth
19+
}
20+
21+
output "region" {
22+
description = "AWS region."
23+
value = var.region
24+
}
25+
26+
output "sqs_queue_asg_notification_arn" {
27+
description = "SQS queue ASG notification ARN"
28+
value = module.node_term_sqs.sqs_queue_arn
29+
}
30+
31+
output "sqs_queue_asg_notification_url" {
32+
description = "SQS queue ASG notification URL"
33+
value = module.node_term_sqs.sqs_queue_id
34+
}
+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
variable "region" {
2+
default = "us-west-2"
3+
}
4+
5+
variable "aws_node_termination_handler_chart_version" {
6+
description = "Version of the aws-node-termination-handler Helm chart to install."
7+
default = "0.15.0"
8+
}
9+
10+
variable "namespace" {
11+
description = "Namespace for the aws-node-termination-handler."
12+
default = "kube-system"
13+
}
14+
15+
variable "serviceaccount" {
16+
description = "Serviceaccount for the aws-node-termination-handler."
17+
default = "aws-node-termination-handler"
18+
}

examples/instance_refresh/versions.tf

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
terraform {
2+
required_version = ">= 0.13.1"
3+
4+
required_providers {
5+
aws = ">= 3.22.0"
6+
local = ">= 1.4"
7+
random = ">= 2.1"
8+
kubernetes = "~> 1.11"
9+
helm = "~> 2.1.2"
10+
}
11+
}

local.tf

+6-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ locals {
3434
asg_max_size = "3" # Maximum worker capacity in the autoscaling group.
3535
asg_min_size = "1" # Minimum worker capacity in the autoscaling group. NOTE: Changing this parameter will affect asg_desired_capacity; e.g. changing its value to 2 will change asg_desired_capacity to 2, but changing it back to 1 will not affect asg_desired_capacity.
3636
asg_force_delete = false # Enable forced deletion for the autoscaling group.
37-
asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group.
37+
asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group.
3838
default_cooldown = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start.
3939
health_check_type = null # Controls how health checking is done. Valid values are "EC2" or "ELB".
4040
health_check_grace_period = null # Time in seconds after instance comes into service before checking health.
@@ -95,6 +95,11 @@ locals {
9595
spot_max_price = "" # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price
9696
max_instance_lifetime = 0 # Maximum number of seconds instances can run in the ASG. 0 is unlimited.
9797
elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc.
98+
instance_refresh_enabled = false # Enable instance refresh for the worker autoscaling group.
99+
instance_refresh_strategy = "Rolling" # Strategy to use for instance refresh. Default is 'Rolling', which is the only valid value.
100+
instance_refresh_min_healthy_percentage = 90 # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity.
101+
instance_refresh_instance_warmup = null # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period.
102+
instance_refresh_triggers = [] # Set of additional property names that will trigger an Instance Refresh. A refresh will always be triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy.
98103
}
99104

100105
workers_group_defaults = merge(

workers.tf

+27
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,33 @@ resource "aws_autoscaling_group" "workers" {
162162
}
163163
}
164164

165+
# logic duplicated in workers_launch_template.tf
166+
dynamic "instance_refresh" {
167+
for_each = lookup(var.worker_groups[count.index],
168+
"instance_refresh_enabled",
169+
local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
170+
content {
171+
strategy = lookup(
172+
var.worker_groups[count.index], "instance_refresh_strategy",
173+
local.workers_group_defaults["instance_refresh_strategy"]
174+
)
175+
preferences {
176+
instance_warmup = lookup(
177+
var.worker_groups[count.index], "instance_refresh_instance_warmup",
178+
local.workers_group_defaults["instance_refresh_instance_warmup"]
179+
)
180+
min_healthy_percentage = lookup(
181+
var.worker_groups[count.index], "instance_refresh_min_healthy_percentage",
182+
local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
183+
)
184+
}
185+
triggers = lookup(
186+
var.worker_groups[count.index], "instance_refresh_triggers",
187+
local.workers_group_defaults["instance_refresh_triggers"]
188+
)
189+
}
190+
}
191+
165192
lifecycle {
166193
create_before_destroy = true
167194
ignore_changes = [desired_capacity]

0 commit comments

Comments
 (0)