k8s-cluster.yaml

# Deploy kubernetes via kubeadm.
# $ limactl start ./k8s.yaml
# $ limactl shell k8s kubectl

# TODO leverage {{.Dir}} to copy share files between guests
# It can be accessed from the host by exporting the kubeconfig file;
# the ports are already forwarded automatically by lima:
#
# $ export KUBECONFIG=$(limactl list k8s --format 'unix://{{.Dir}}/copied-from-guest/kubeconfig.yaml')
# $ kubectl get no
# NAME       STATUS   ROLES                  AGE   VERSION
# lima-k8s   Ready    control-plane,master   44s   v1.22.3

# This template requires Lima v0.20.0 or later.
images:
  # Try to use release-yyyyMMdd image if available. Note that release-yyyyMMdd will be removed after several months.
  - location: "https://cloud-images.ubuntu.com/releases/24.04/release-20250115/ubuntu-24.04-server-cloudimg-amd64.img"
    arch: "x86_64"
    digest: "sha256:28d2f9df3ac0d24440eaf6998507df3405142cf94a55e1f90802c78e43d2d9df"
  - location: "https://cloud-images.ubuntu.com/releases/24.04/release-20250115/ubuntu-24.04-server-cloudimg-arm64.img"
    arch: "aarch64"
    digest: "sha256:f11282a728ad42f8bfe0b646a6807674d79a019bfc229d80032345dd3228a2db"
  # Fallback to the latest release image.
  # Hint: run `limactl prune` to invalidate the cache
  - location: "https://cloud-images.ubuntu.com/releases/24.04/release/ubuntu-24.04-server-cloudimg-amd64.img"
    arch: "x86_64"
  - location: "https://cloud-images.ubuntu.com/releases/24.04/release/ubuntu-24.04-server-cloudimg-arm64.img"
    arch: "aarch64"

# Mounts
vmType: "vz" # only for macOS; Linux uses 'qemu'

# Resources to be passed along with create command
# cpus: 2
# memory: "8GiB"
# disk: "30GiB"

# Storage
mountType: "virtiofs"
mounts:
  - location: "~"
    writable: false
  - location: "~/Workspace/klima/.klima/k8scfg"
    mountPoint: "/k8scfg"
    writable: true

# Networking
networks:
  - lima: shared

# DNS
hostResolver:
  enabled: true

# Runtime containerd
containerd:
  system: true
  user: false

# system init
provision:
  # Add nodes to '/etc/hosts'
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail

      export IP=$(ip -4 addr show lima0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}')
      export HOSTNAME=$(hostname)
      if ! grep -q "$IP $HOSTNAME" /etc/hosts; then
        echo "$IP $HOSTNAME" | sudo tee -a /etc/hosts
      fi
  # If this is cp1, aka the FIRST control-plane node, then we need to
  # backup and remove the configuration files stored in /k8scfg
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # only run if k8s has not been initialized yet
      test -e /etc/kubernetes/admin.conf && exit 0
      # only run on the first control-plane node
      hostname | grep -q 'lima-cp1' || exit 0
      backup_dir="/k8scfg/backup"
      i=0
      while [ -d "${backup_dir}.${i}" ]; do
        i=$((i + 1))
      done
      mkdir "${backup_dir}.${i}"
      if [ -f /k8scfg/*.sh ]; then
        mv /k8scfg/*.sh "${backup_dir}.${i}/"
      fi
      if [ -f /k8scfg/*.ip ]; then
        mv /k8scfg/*.ip "${backup_dir}.${i}/"
      fi
      if [ -f /k8scfg/*.conf ]; then
        mv /k8scfg/*.conf "${backup_dir}.${i}/"
      fi
  # install and configure time sync server
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # only run if chronyd is not present on the system
      # chrony is required by the system to keep time in sync
      # NTP SERVER for the cluster
      hostname | grep -q 'lima-cp1' || exit 0
      systemctl is-active --quiet chronyd && exit 0
      apt-get update
      apt-get install -y chrony
      # config chrony
      # using makestep 1 -1 instead of makestep 1 3
      # see: https://github.com/SuperQ/chrony/blob/master/doc/faq.adoc#34-is-chronyd-allowed-to-step-the-system-clock
      cat <<EOF | sudo tee /etc/chrony/chrony.conf
      allow 192.168.105.0/24

      pool ntp.ubuntu.com        iburst maxsources 4
      pool time.apple.com        iburst maxsources 4
      pool time.nist.gov         iburst maxsources 4
      pool 1.ubuntu.pool.ntp.org iburst maxsources 1
      pool 2.ubuntu.pool.ntp.org iburst maxsources 2

      driftfile /var/lib/chrony/chrony.drift
      logdir /var/log/chrony
      makestep 1.0 -1 
      rtcsync
      EOF
      systemctl enable --now chrony.service
      systemctl restart chrony.service
    # install and configure time sync client
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # chrony is required by the system to keep time in sync
      # only run if chronyd is not present on the system
      systemctl is-active --quiet chronyd && exit 0
      # NTP CLIENTS
      hostname | grep -qv 'lima-cp1' || exit 0
      # make a time client
      apt-get update
      apt-get install -y chrony
      # config chrony https://wiki.archlinux.org/title/Chrony
      # using makestep 1 -1 instead of makestep 1 3
      # removed: rtcautotrim 30
      # removed: rtconutc
      # removed: rtcfile /var/lib/chrony/chrony.rtc
      # see: https://github.com/SuperQ/chrony/blob/master/doc/faq.adoc#34-is-chronyd-allowed-to-step-the-system-clock
      cat <<EOF | sudo tee /etc/chrony/chrony.conf
      server $(cat /k8scfg/lima-cp1.ip) iburst
      driftfile /var/lib/chrony/chrony.drift
      logdir /var/log/chrony
      makestep 1.0 -1
      rtcsync
      EOF
      systemctl enable --now chrony.service
      systemctl restart chrony.service

  ##  # See <https://kubernetes.io/docs/setup/production-environment/container-runtimes/>
  ##  - mode: system
  ##    script: |
  ##      #!/bin/bash
  ##      set -eux -o pipefail
  ##      systemctl is-active --quiet containerd && exit 0
  ##      curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
  ##      echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list
  ##      apt-get update -y
  ##      apt-get install -y containerd.io
  ##      # generate default containerd config
  ##      containerd config default > config.toml
  ##      mv config.toml /etc/containerd/config.toml
  ##      # update settings required by k8s
  ##      sudo sed -i 's/^disabled_plugins \=/\#disabled_plugins \=/g' /etc/containerd/config.toml
  ##      sudo sed -i 's/SystemdCgroup \= false/\SystemdCgroup \= true/g' /etc/containerd/config.toml
  ##      # Configuring the systemd cgroup driver
  ##      systemctl enable --now containerd
  ##      systemctl restart containerd
  # See <https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/>
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # only run if kubeadm is not present on the system
      command -v kubeadm >/dev/null 2>&1 && exit 0
      # Install and configure prerequisites
      cat <<EOF | sudo tee /etc/modules-load.d/containerd.conf
      overlay
      br_netfilter
      EOF
      modprobe overlay
      modprobe br_netfilter
      cat <<EOF | sudo tee /etc/sysctl.d/99-kubernetes-cri.conf
      net.bridge.bridge-nf-call-iptables  = 1
      net.ipv4.ip_forward                 = 1
      net.bridge.bridge-nf-call-ip6tables = 1
      EOF
      sysctl --system
      # Installing kubeadm, kubelet and kubectl
      export DEBIAN_FRONTEND=noninteractive
      apt-get update
      # lvm2 is required by ceph/rook
      apt-get install -y apt-transport-https ca-certificates curl lvm2
      # add kubernetes repo
      VERSION=$(curl -L -s https://dl.k8s.io/release/stable.txt | sed -e 's/v//' | cut -d'.' -f1-2)
      echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v${VERSION}/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
      curl -fsSL https://pkgs.k8s.io/core:/stable:/v${VERSION}/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
      apt-get update
      # cri-tools
      apt-get install -y cri-tools
      cat  <<EOF | sudo tee /etc/crictl.yaml
      runtime-endpoint: unix:///run/containerd/containerd.sock
      EOF
      # cni-plugins
      apt-get install -y kubernetes-cni
      rm -f /etc/cni/net.d/*.conf*
      apt-get install -y kubelet kubeadm kubectl && apt-mark hold kubelet kubeadm kubectl
      systemctl enable --now kubelet
  ##  # See <https://kubernetes.io/docs/setup/production-environment/container-runtimes/>
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      grep 'SystemdCgroup = true' /etc/containerd/config.toml && exit 0

      # Do not limit the number of tasks that can be spawned by containerd
      mkdir -p /etc/systemd/system/containerd.service.d
      cat <<EOF >/etc/systemd/system/containerd.service.d/max-tasks.conf
      [Service]
      TasksMax=infinity
      EOF

      # Decreases the likelihood that containerd is killed due to memory
      # pressure.
      #
      # Please see the following link for more information about the
      # OOMScoreAdjust configuration property:
      # https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
      cat <<EOF >/etc/systemd/system/containerd.service.d/memory-pressure.conf
      [Service]
      OOMScoreAdjust=-999
      EOF

      # generate default containerd config
      containerd config default > config.toml
      mv config.toml /etc/containerd/config.toml

      # this is not populated in the default config.toml
      ## update settings recommended by k8s
      #sed -i 's/^disabled_plugins \=/\#disabled_plugins \=/g' /etc/containerd/config.toml

      # note: this is the top level runtime, currently, unknown if grpc is inherited from this
      # Configuring the systemd cgroup driver
      # append "SystemdCgroup = true" to /etc/containerd/config.toml
      # after "[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc.options]"
      sed -i "/\[plugins\.'io\.containerd\.cri\.v1\.runtime'\.containerd\.runtimes\.runc\.options\]/a \ \ \ \ \ \ \ \ \ \ \ \ SystemdCgroup = true" /etc/containerd/config.toml

      # update sandbox_image
      #sudo sed -i 's/sandbox_image \= \"\"/sandbox_image \= \"$(kubeadm config images list | grep pause | sort -r | head -n1)\"/g' /etc/containerd/config.toml
      # inject stargz before '[cgroup]'
      #  [proxy_plugins]
      #    [proxy_plugins."stargz"]
      #      type = "snapshot"
      #      address = "/run/containerd-stargz-grpc/containerd-stargz-grpc.sock"
      sed -i '/\[cgroup\]/i [proxy_plugins]\n  [proxy_plugins."stargz"]\n    type = "snapshot"\n    address = "\/run\/containerd-stargz-grpc\/containerd-stargz-grpc.sock"' /etc/containerd/config.toml

      # Configuring the systemd cgroup driver and overriding the sandbox (pause) image
      # for gRpc
      sed -i '/\[plugins\.'io.containerd.grpc.v1.cri'.x509_key_pair_streaming\]/i \
              sandbox_image = "$(kubeadm config images list | grep pause | sort -r | head -n1)"\n\
              [plugins."io.containerd.grpc.v1.cri".containerd]\n\
              [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]\n\
                [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]\n\
                runtime_type = "io.containerd.runc.v2"\n\
                [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]\n\
                  SystemdCgroup = true' /etc/containerd/config.toml

      systemctl daemon-reload
      systemctl restart containerd
  #  # See <https://kubernetes.io/docs/setup/production-environment/container-runtimes/>
  #  - mode: system
  #    script: |
  #      #!/bin/bash
  #      # version = 2
  #      #  [proxy_plugins]
  #      #    [proxy_plugins."stargz"]
  #      #      type = "snapshot"
  #      #      address = "/run/containerd-stargz-grpc/containerd-stargz-grpc.sock"
  #      set -eux -o pipefail
  #      grep SystemdCgroup /etc/containerd/config.toml && exit 0
  #      grep "version = 2" /etc/containerd/config.toml || exit 1
  #      # Configuring the systemd cgroup driver
  #      ### Overriding the sandbox (pause) image
  #      cat <<EOF >>/etc/containerd/config.toml
  #        [plugins]
  #          [plugins."io.containerd.grpc.v1.cri"]
  #            sandbox_image = "$(kubeadm config images list | grep pause | sort -r | head -n1)"
  #            [plugins."io.containerd.grpc.v1.cri".containerd]
  #              [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
  #                [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
  #                  runtime_type = "io.containerd.runc.v2"
  #                  [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
  #                    SystemdCgroup = true
  #      EOF
  #      systemctl restart containerd
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # DO NOT run if k8s has been initialized
      test -e /etc/kubernetes/admin.conf && exit 0
      export KUBECONFIG=/etc/kubernetes/admin.conf
      systemctl stop kubelet
      kubeadm config images list
      kubeadm config images pull --cri-socket=unix:///run/containerd/containerd.sock
      systemctl start kubelet
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      echo $(ip -4 addr show lima0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}') > /k8scfg/$(hostname).ip
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # only run on control-plane node
      hostname | grep -q 'lima-cp1' || exit 0
      # DO NOT run. if admin.conf exists we ASSUME that k8s has been initialized
      test -e /etc/kubernetes/admin.conf && exit 0
      export KUBECONFIG=/etc/kubernetes/admin.conf
      #    systemctl stop kubelet
      #    kubeadm config images list
      #    kubeadm config images pull --cri-socket=unix:///run/containerd/containerd.sock
      #    systemctl start kubelet
      # Initializing your control-plane node
      cat <<EOF >kubeadm-config.yaml
      kind: InitConfiguration
      apiVersion: kubeadm.k8s.io/v1beta4
      nodeRegistration:
        criSocket: unix:///run/containerd/containerd.sock
      ---
      kind: ClusterConfiguration
      apiVersion: kubeadm.k8s.io/v1beta4
      certificateValidityPeriod: 8760h # 1 year
      caCertificateValidityPeriod: 87600h # 10 years
      clusterName: kubernetes
      controlPlaneEndpoint: "$(ip -4 addr show lima0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}'):6443"
      networking:
        podSubnet: "10.244.0.0/16" # --pod-network-cidr
      apiServer:
        certSANs: # --apiserver-cert-extra-sans
        - "127.0.0.1"
        - "$(ip -4 addr show lima0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}')"
      #
      # The following network settings are required for prometheus to gather metrics
      #
        extraArgs:
        - name: "bind-address"
          value: "0.0.0.0"
      controllerManager:
        extraArgs:
        - name: "bind-address"
          value: "0.0.0.0"
      scheduler:
        extraArgs:
        - name: "bind-address"
          value: "0.0.0.0"
      etcd:
        local:
          extraArgs:
          - name: "listen-metrics-urls"
            value: "http://0.0.0.0:2381"  
      ---
      kind: KubeletConfiguration
      apiVersion: kubelet.config.k8s.io/v1beta1
      cgroupDriver: systemd
      EOF
      #!/bin/bash 
      kubeadm init --upload-certs --config kubeadm-config.yaml > /k8scfg/$(hostname)-up.log 2>&1
      # Installing a Pod network add-on
      kubectl apply -f https://github.com/flannel-io/flannel/releases/download/v0.26.2/kube-flannel.yml --validate=false
      # Control plane node isolation
      ##
      ## keep the control plane node isolated
      ## 
      ##kubectl taint nodes --all node-role.kubernetes.io/control-plane-
      ### Replace the server address with localhost, so that it works also from the host
      ### sed -e "/server:/ s|https://.*:\([0-9]*\)$|https://127.0.0.1:\1|" -i $KUBECONFIG
      mkdir -p ${HOME:-/root}/.kube && cp -f $KUBECONFIG ${HOME:-/root}/.kube/config
      sudo cp -f /etc/kubernetes/admin.conf /k8scfg/
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      hostname | grep -q 'lima-cp1' || exit 0
      # generate kubeadm JoinConfiguration for control-plane node 
      # 
      # TODO join command for the control-plane node is busted.
      # 
      # [download-certs] Downloading the certificates in Secret "kubeadm-certs" in the "kube-system" Namespace
      # error execution phase control-plane-prepare/download-certs: error downloading certs: error decoding secret data with provided key: cipher: message authentication failed
      # To see the stack trace of this error execute with --v=5 or higher
      #
      # WARNING!! This is busted. Unable to create the join for control plances at this time
      #
      echo $(sudo kubeadm token create --print-join-command --certificate-key `sudo kubeadm certs certificate-key`) > /k8scfg/cp-join.sh 
      # generate kubeadm JoinConfiguration for worker node
      cat /k8scfg/cp-join.sh | awk '{for(i=1;i<=NF;i++) if($i=="--control-plane") {for(j=1;j<i;j++) printf $j " "; print ""; break}}' > /k8scfg/worker-join.sh
      chmod +x /k8scfg/cp-join.sh /k8scfg/worker-join.sh
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # Only run on worker nodes
      hostname | grep -q 'lima-n' || exit 0
      # join the cluster
      sudo /k8scfg/worker-join.sh > /k8scfg/$(hostname)-up.log 2>&1
  - mode: system
    script: |
      #!/bin/bash
      set -eux -o pipefail
      # DO NOT run if k8s has been initialized
      test -e /etc/kubernetes/admin.conf && exit 0
      # DO NOT run on the first control-plane node
      hostname | grep -qv 'lima-cp1' || exit 0
      # Only run on control-plane nodes
      hostname | grep -q 'lima-cp' || exit 0
      # join the cluster
      #
      # TODO join command for the control-plane node is busted. 
      sudo /k8scfg/cp-join.sh > /k8scfg/$(hostname)-up.log 2>&1
# Begin readiness probes
probes:
  - description: "chronyd to be running"
    script: |
      #!/bin/bash
      set -eux -o pipefail
      if ! timeout 30s bash -c "until systemctl is-active --quiet chronyd.service; do sleep 3; done"; then
        echo >&2 "chronyd is not running yet"
        exit 1
      fi
  - description: "kubeadm to be installed"
    script: |
      #!/bin/bash
      set -eux -o pipefail
      if ! timeout 30s bash -c "until command -v kubeadm >/dev/null 2>&1; do sleep 5; done"; then
        echo >&2 "kubeadm is not installed yet"
        exit 1
      fi
    hint: |
      See "/var/log/cloud-init-output.log" in the guest
  - description: "kubernetes images to be pulled"
    script: |
      #!/bin/bash
      set -eux -o pipefail
      if ! timeout 30s bash -c "images=\"$(kubeadm config images list)\"; until for image in \$images; do sudo crictl image -q \$image | grep -q sha256; done; do sleep 3; done"; then
        echo >&2 "k8s images are not pulled yet"
        exit 1
      fi
  - description: "admin.conf to be created"
    script: |
      #!/bin/bash
      set -eux -o pipefail
      if ! timeout 30s bash -c "until test -f /k8scfg/admin.conf; do sleep 3; done"; then
        echo >&2 "admin.conf is not created yet"
        exit 1
      fi
  - description: "join scripts to be created"
    script: |
      #!/bin/bash
      set -eux -o pipefail
      if ! timeout 30s bash -c "until test -f /k8scfg/cp-join.sh && test -f /k8scfg/worker-join.sh; do sleep 3; done"; then
        echo >&2 "join scripts are not created yet"
        exit 1
      fi
#copyToHost:
#- guest: "/etc/kubernetes/admin.conf"
#  host: "{{.Dir}}/copied-from-guest/kubeconfig.yaml"
#  deleteOnStop: true
message: |
  Kubernetes has been deployed. Services may take a few minutes to become available.

  Run 'kubectl get nodes' to verify
  ------