k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/maintenance/enable_soft_eviction.py (about)

     1  #!/usr/bin/env python3
     2  
     3  # Copyright 2017 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # This script hijacks the COS kubelet service definition to set soft eviction thresholds
    18  # for nodefs on the prow builds cluster
    19  # USE AT YOUR OWN RISK.
    20  # TODO: delete this once dynamic kubelet config is available
    21  
    22  # pylint: disable=line-too-long
    23  
    24  
    25  import os
    26  import sys
    27  import subprocess
    28  
    29  # xref prow/Makefile get-build-cluster-credentials
    30  CLUSTER = 'prow'
    31  ZONE = 'us-central1-f'
    32  PROJECT = 'k8s-prow-builds'
    33  
    34  AUTH_TO_CLUSTER_COMMAND = 'gcloud container clusters get-credentials %s --project=%s --zone=%s' % (CLUSTER, PROJECT, ZONE)
    35  
    36  # this should be 20% more than the hard eviction threshold
    37  # the grace period should be longer than the typical time for another pod to be cleaned up by sinker
    38  KUBELET_ARGS_TO_ADD = '--eviction-soft=nodefs.available<30% --eviction-soft-grace-period=nodefs.available=2h'
    39  # commands used *in order* to update the kubelet
    40  KUBELET_UPDATE_COMMANDS = [
    41      # this works because the ExecStart line normally ends with $KUBELET_OPTS
    42      # so we replace `KUBELET_OPTS.*` (to the end of the line) with `KUBELET_OPTS --some --args ---we --want`
    43      "sudo sed -i 's/KUBELET_OPTS.*/KUBELET_OPTS %s/' /etc/systemd/system/kubelet.service" % KUBELET_ARGS_TO_ADD,
    44      "sudo systemctl daemon-reload",
    45      "sudo systemctl restart kubelet"
    46  ]
    47  
    48  def get_nodes():
    49      command = ['kubectl', 'get', 'nodes']
    50      res = subprocess.check_output(command, encoding='utf-8')
    51      nodes = []
    52      for line in res.split('\n')[1:]:
    53          node = line.split(' ')[0]
    54          if node != '':
    55              nodes.append(node)
    56      return nodes
    57  
    58  
    59  def run_on_node(node_name, command):
    60      print("node: %s running: %s" % (node_name, command))
    61      subprocess.call(['gcloud', 'compute', 'ssh', '--project='+PROJECT, '--zone='+ZONE, '--command='+command, node_name])
    62  
    63  def main():
    64      if sys.argv[-1] != "--yes-i-accept-that-this-is-very-risky":
    65          print("This command is very risky and unsupported (!)")
    66          print("Do not run this unless you know what you are doing and accept the consequences (!)")
    67          sys.exit(-1)
    68  
    69      # auth to the cluster
    70      print('getting cluster auth...')
    71      os.system(AUTH_TO_CLUSTER_COMMAND)
    72      print('')
    73  
    74      # get the list of nodes
    75      print('getting nodes...')
    76      nodes = get_nodes()
    77      print("got %d nodes." % len(nodes))
    78      print('')
    79  
    80      # run our service patch command on the nodes
    81      print('updating kubelet service on the nodes...')
    82      for node in nodes:
    83          print("\nupdating node: %s" % node)
    84          for command in KUBELET_UPDATE_COMMANDS:
    85              run_on_node(node, command)
    86  
    87      print('\ndone')
    88  
# Run the update only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()