github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/experiment/maintenance/enable_soft_eviction.py (about)

     1  #!/usr/bin/env python
     2  
     3  # Copyright 2017 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # This script hijacks the COS kubelet service definition to set soft eviction thresholds
    18  # for nodefs on the prow builds cluster
    19  # USE AT YOUR OWN RISK.
    20  # TODO(bentheelder): delete this once dynamic kubelet config is available
    21  
    22  # pylint: disable=line-too-long
    23  
    24  from __future__ import print_function
    25  
    26  import os
    27  import sys
    28  import subprocess
    29  
# Cluster name, zone and project used for both `gcloud container clusters
# get-credentials` and `gcloud compute ssh` below.
# xref prow/Makefile get-build-cluster-credentials
CLUSTER = 'prow'
ZONE = 'us-central1-f'
PROJECT = 'k8s-prow-builds'

# Shell command that points kubectl at the cluster named above.
AUTH_TO_CLUSTER_COMMAND = 'gcloud container clusters get-credentials %s --project=%s --zone=%s' % (CLUSTER, PROJECT, ZONE)

# Extra kubelet flags appended to the service definition.
# The soft threshold should be 20% more than the hard eviction threshold.
# The grace period should be longer than the typical time for another pod to be cleaned up by sinker.
KUBELET_ARGS_TO_ADD = '--eviction-soft=nodefs.available<30% --eviction-soft-grace-period=nodefs.available=2h'
# Commands run *in order* on each node to update the kubelet.
KUBELET_UPDATE_COMMANDS = [
    # This works because the ExecStart line normally ends with $KUBELET_OPTS,
    # so we replace `KUBELET_OPTS.*` (to the end of the line) with `KUBELET_OPTS --some --args --we --want`.
    # The literal `%` inside KUBELET_ARGS_TO_ADD is safe here: it is the substituted value, not part of the format string.
    "sudo sed -i 's/KUBELET_OPTS.*/KUBELET_OPTS %s/' /etc/systemd/system/kubelet.service" % KUBELET_ARGS_TO_ADD,
    # Make systemd pick up the edited unit file, then restart the kubelet with the new flags.
    "sudo systemctl daemon-reload",
    "sudo systemctl restart kubelet"
]
    48  
    49  def get_nodes():
    50      command = ['kubectl', 'get', 'nodes']
    51      res = subprocess.check_output(command)
    52      nodes = []
    53      for line in res.split('\n')[1:]:
    54          node = line.split(' ')[0]
    55          if node != '':
    56              nodes.append(node)
    57      return nodes
    58  
    59  
    60  def run_on_node(node_name, command):
    61      print("node: %s running: %s" % (node_name, command))
    62      subprocess.call(['gcloud', 'compute', 'ssh', '--project='+PROJECT, '--zone='+ZONE, '--command='+command, node_name])
    63  
    64  def main():
    65      if sys.argv[-1] != "--yes-i-accept-that-this-is-very-risky":
    66          print("This command is very risky and unsupported (!)")
    67          print("Do not run this unless you know what you are doing and accept the consequences (!)")
    68          sys.exit(-1)
    69  
    70      # auth to the cluster
    71      print('getting cluster auth...')
    72      os.system(AUTH_TO_CLUSTER_COMMAND)
    73      print('')
    74  
    75      # get the list of nodes
    76      print('getting nodes...')
    77      nodes = get_nodes()
    78      print("got %d nodes." % len(nodes))
    79      print('')
    80  
    81      # run our service patch command on the nodes
    82      print('updating kubelet service on the nodes...')
    83      for node in nodes:
    84          print("\nupdating node: %s" % node)
    85          for command in KUBELET_UPDATE_COMMANDS:
    86              run_on_node(node, command)
    87  
    88      print('\ndone')
    89  
# Only run the update when executed as a script, not when imported.
if __name__ == '__main__':
    main()