#!/usr/bin/env python

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script hijacks the COS kubelet service definition to set soft eviction thresholds
# for nodefs on the prow builds cluster
# USE AT YOUR OWN RISK.
# TODO(bentheelder): delete this once dynamic kubelet config is available

# pylint: disable=line-too-long

from __future__ import print_function

import os
import sys
import subprocess

# xref prow/Makefile get-build-cluster-credentials
CLUSTER = 'prow'
ZONE = 'us-central1-f'
PROJECT = 'k8s-prow-builds'

AUTH_TO_CLUSTER_COMMAND = 'gcloud container clusters get-credentials %s --project=%s --zone=%s' % (CLUSTER, PROJECT, ZONE)

# this should be 20% more than the hard eviction threshold
# the grace period should be longer than the typical time for another pod to be cleaned up by sinker
KUBELET_ARGS_TO_ADD = '--eviction-soft=nodefs.available<30% --eviction-soft-grace-period=nodefs.available=2h'
# commands used *in order* to update the kubelet
KUBELET_UPDATE_COMMANDS = [
    # this works because the ExecStart line normally ends with $KUBELET_OPTS
    # so we replace `KUBELET_OPTS.*` (to the end of the line) with `KUBELET_OPTS --some --args ---we --want`
    # (the replacement consumes everything after KUBELET_OPTS, so re-running it yields the same line)
    "sudo sed -i 's/KUBELET_OPTS.*/KUBELET_OPTS %s/' /etc/systemd/system/kubelet.service" % KUBELET_ARGS_TO_ADD,
    "sudo systemctl daemon-reload",
    "sudo systemctl restart kubelet"
]


def parse_node_names(output):
    """Extract node names from the tabular output of `kubectl get nodes`.

    Args:
        output: the raw command output, either bytes (what
            subprocess.check_output returns on Python 3) or text.

    Returns:
        A list with the first whitespace-separated field of every non-blank
        line after the header row, in output order.
    """
    # check_output hands back bytes on Python 3; on Python 2 bytes *is* str,
    # so this only decodes where it is actually required.
    if not isinstance(output, str):
        output = output.decode('utf-8')
    names = []
    # the first line is the column header (NAME STATUS ...), skip it
    for line in output.splitlines()[1:]:
        fields = line.split()
        if fields:
            names.append(fields[0])
    return names


def get_nodes():
    """Return the names of all nodes reported by `kubectl get nodes`.

    Uses whatever cluster kubectl is currently authenticated against.

    Raises:
        subprocess.CalledProcessError: if kubectl exits with a non-zero status.
    """
    command = ['kubectl', 'get', 'nodes']
    return parse_node_names(subprocess.check_output(command))


def run_on_node(node_name, command):
    """Run a shell command on a node via `gcloud compute ssh`.

    Args:
        node_name: name of the GCE instance backing the node.
        command: the shell command line to execute on the node.

    Returns:
        The exit status of the gcloud invocation (0 on success).
    """
    print("node: %s running: %s" % (node_name, command))
    return subprocess.call(['gcloud', 'compute', 'ssh', '--project='+PROJECT, '--zone='+ZONE, '--command='+command, node_name])


def main():
    """Patch and restart the kubelet on every node of the cluster.

    Refuses to do anything unless the last command line argument is the
    explicit risk acknowledgement flag. Exits non-zero if cluster
    authentication fails or if any node could not be fully updated.
    """
    if sys.argv[-1] != "--yes-i-accept-that-this-is-very-risky":
        print("This command is very risky and unsupported (!)")
        print("Do not run this unless you know what you are doing and accept the consequences (!)")
        sys.exit(-1)

    # auth to the cluster
    print('getting cluster auth...')
    # without fresh credentials kubectl would list nodes from whatever
    # context happens to be active, so do not continue past a failure here
    if os.system(AUTH_TO_CLUSTER_COMMAND) != 0:
        print('failed to get cluster credentials, aborting.')
        sys.exit(1)
    print('')

    # get the list of nodes
    print('getting nodes...')
    nodes = get_nodes()
    print("got %d nodes." % len(nodes))
    print('')

    # run our service patch command on the nodes
    print('updating kubelet service on the nodes...')
    failed_nodes = []
    for node in nodes:
        print("\nupdating node: %s" % node)
        for command in KUBELET_UPDATE_COMMANDS:
            # the commands build on each other (patch, reload, restart), so
            # there is no point restarting the kubelet if an earlier step failed
            if run_on_node(node, command) != 0:
                print("command failed on node %s, skipping its remaining commands" % node)
                failed_nodes.append(node)
                break

    if failed_nodes:
        print('\nfailed to update %d node(s): %s' % (len(failed_nodes), ', '.join(failed_nodes)))
        sys.exit(1)

    print('\ndone')


if __name__ == '__main__':
    main()