#!/usr/bin/env python3

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script hijacks the COS kubelet service definition to set soft eviction thresholds
# for nodefs on the prow builds cluster
# USE AT YOUR OWN RISK.
# TODO: delete this once dynamic kubelet config is available

# pylint: disable=line-too-long


import os
import sys
import subprocess

# xref prow/Makefile get-build-cluster-credentials
CLUSTER = 'prow'
ZONE = 'us-central1-f'
PROJECT = 'k8s-prow-builds'

AUTH_TO_CLUSTER_COMMAND = 'gcloud container clusters get-credentials %s --project=%s --zone=%s' % (CLUSTER, PROJECT, ZONE)

# this should be 20% more than the hard eviction threshold
# the grace period should be longer than the typical time for another pod to be cleaned up by sinker
KUBELET_ARGS_TO_ADD = '--eviction-soft=nodefs.available<30% --eviction-soft-grace-period=nodefs.available=2h'
# commands used *in order* to update the kubelet
KUBELET_UPDATE_COMMANDS = [
    # this works because the ExecStart line normally ends with $KUBELET_OPTS
    # so we replace `KUBELET_OPTS.*` (to the end of the line) with `KUBELET_OPTS --some --args ---we --want`
    "sudo sed -i 's/KUBELET_OPTS.*/KUBELET_OPTS %s/' /etc/systemd/system/kubelet.service" % KUBELET_ARGS_TO_ADD,
    "sudo systemctl daemon-reload",
    "sudo systemctl restart kubelet",
]


def get_nodes():
    """Return the node names printed by `kubectl get nodes`.

    The first output line (the table header) is skipped; the first
    space-delimited field of every remaining non-empty line is taken as the
    node name.

    Raises:
        subprocess.CalledProcessError: if kubectl exits with a non-zero status.
    """
    command = ['kubectl', 'get', 'nodes']
    res = subprocess.check_output(command, encoding='utf-8')
    nodes = []
    for line in res.split('\n')[1:]:
        node = line.split(' ')[0]
        if node != '':
            nodes.append(node)
    return nodes


def run_on_node(node_name, command):
    """Run `command` on `node_name` through `gcloud compute ssh`.

    Args:
        node_name: name of the instance to ssh into.
        command: shell command line passed to gcloud via `--command=`.

    Returns:
        The exit status of the gcloud invocation (0 on success).
    """
    print("node: %s running: %s" % (node_name, command))
    return subprocess.call(['gcloud', 'compute', 'ssh', '--project='+PROJECT, '--zone='+ZONE, '--command='+command, node_name])


def main():
    """Patch and restart the kubelet on every node of the cluster.

    Refuses to run unless the last command line argument is the explicit
    risk-acceptance flag. Exits non-zero if cluster credentials cannot be
    fetched or if any node could not be fully updated.
    """
    if sys.argv[-1] != "--yes-i-accept-that-this-is-very-risky":
        print("This command is very risky and unsupported (!)")
        print("Do not run this unless you know what you are doing and accept the consequences (!)")
        sys.exit(-1)

    # auth to the cluster
    print('getting cluster auth...')
    # without fresh credentials kubectl would list the nodes of whatever
    # context happens to be active, so do not continue on failure
    if os.system(AUTH_TO_CLUSTER_COMMAND) != 0:
        print('failed to get cluster credentials, aborting.')
        sys.exit(1)
    print('')

    # get the list of nodes
    print('getting nodes...')
    nodes = get_nodes()
    print("got %d nodes." % len(nodes))
    print('')

    # run our service patch command on the nodes
    print('updating kubelet service on the nodes...')
    failed_nodes = []
    for node in nodes:
        print("\nupdating node: %s" % node)
        for command in KUBELET_UPDATE_COMMANDS:
            # the commands depend on each other: do not reload / restart the
            # kubelet on a node where an earlier step did not succeed
            if run_on_node(node, command) != 0:
                print("node: %s failed: %s (skipping remaining commands for this node)" % (node, command))
                failed_nodes.append(node)
                break

    if failed_nodes:
        print('\nfailed to update %d node(s): %s' % (len(failed_nodes), ', '.join(failed_nodes)))
        sys.exit(1)

    print('\ndone')


if __name__ == '__main__':
    main()