#!/usr/bin/env python3

# Source: k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/maintenance/shift_nodepool_capacity.py

# Copyright 2018 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script drains nodes from one node pool and adds nodes to another,
# n:m at a time (n nodes drained for every m nodes added).
#
# Use like:
#   shift_nodepool_capacity.py num_to_add shrink_increment:grow_increment pool-to-drain pool-to-grow
#
# EG:
#   shift_nodepool_capacity.py 5 2:1 default-pool pool-n1-highmem-8-300gb
#
# for nodefs on the prow builds cluster
# USE AT YOUR OWN RISK.

import argparse
import sys
import subprocess
import json
import math


def _gcloud_json(args):
    """Run ``gcloud <args> --format=json`` and return the decoded JSON output.

    Raises:
        subprocess.CalledProcessError: if gcloud exits with a non-zero status.
    """
    return json.loads(subprocess.check_output(
        ['gcloud'] + list(args) + ['--format=json'], encoding='utf-8'))


def _sum_pool_sizes(node_pools, groups):
    """Combine node-pool and instance-group listings into {pool name: size}.

    Args:
        node_pools: decoded JSON list of node pools; each entry is read for
            its 'name' and its 'instanceGroupUrls'.
        groups: decoded JSON list of instance groups; each entry is read for
            its 'name' and 'size'.

    Returns:
        A dict with one key per node pool. Pools with no matching instance
        group are reported with size 0; instance groups that do not belong to
        any listed pool are ignored.
    """
    sizes = {}
    group_to_pool = {}
    for pool in node_pools:
        # every pool starts at zero; instance group sizes are added below
        sizes[pool['name']] = 0
        # this is somewhat brittle, the last component of the URL is the
        # instance group name. the better way to do this is probably to use
        # the APIs directly
        # NOTE(review): .get() tolerates a pool entry without
        # 'instanceGroupUrls' -- confirm whether gcloud can actually omit it
        for url in pool.get('instanceGroupUrls', []):
            group_to_pool[url.split('/')[-1]] = pool['name']

    for group in groups:
        pool_name = group_to_pool.get(group['name'])
        if pool_name is None:
            continue
        sizes[pool_name] += group['size']
    return sizes


def get_pool_sizes(project, zone, cluster):
    """returns a map of node pool name to size using the gcloud cli.

    Lists the cluster's node pools, then the instance groups in `zone`, and
    sums the sizes of the instance groups backing each pool.

    Raises:
        subprocess.CalledProcessError: if either gcloud invocation fails.
    """
    node_pools = _gcloud_json([
        'container', 'node-pools', 'list',
        '--project', project, '--cluster', cluster, '--zone', zone,
    ])
    groups = _gcloud_json([
        'compute', 'instance-groups', 'list',
        '--project', project, '--filter=zone:({})'.format(zone),
    ])
    return _sum_pool_sizes(node_pools, groups)


def resize_nodepool(pool, new_size, project, zone, cluster):
    """resize the nodepool to new_size using the gcloud cli

    The command is printed before it is run.

    Raises:
        subprocess.CalledProcessError: if gcloud exits non-zero. Failing here
            stops the caller from continuing to shift capacity after a resize
            that did not take effect.
    """
    cmd = [
        'gcloud', 'container', 'clusters', 'resize', cluster,
        '--zone', zone, '--project', project, '--node-pool', pool,
        '--num-nodes', str(new_size), '--quiet',
    ]
    print(cmd)
    subprocess.check_call(cmd)


def prompt_confirmation():
    """prompts for interactive confirmation; unless the input is exactly
    'yes', prints 'Cancelling.' and exits via sys.exit(-1)"""
    sys.stdout.write('Please confirm (yes/no): ')
    response = input()
    if response != 'yes':
        print('Cancelling.')
        sys.exit(-1)
    print('Confirmed.')


# xref prow/Makefile
# get-build-cluster-credentials
def parse_args(args):
    """Parse the command line.

    Args:
        args: list of argument strings (without the program name).

    Returns:
        argparse.Namespace with `nodes` (int), `ratio`, `shrink`, `grow`,
        `cluster`, `zone` and `project` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('nodes', type=int,
                        help='Number of Nodes to add.')
    parser.add_argument('ratio', type=str,
                        help='ShrinkIncrement:GrowIncrement, Ex 2:1.')
    parser.add_argument('shrink', type=str,
                        help='Pool name to drain nodes from.')
    parser.add_argument('grow', type=str,
                        help='Pool name to grow nodes into.')
    parser.add_argument('--cluster', type=str, default="prow",
                        help='Name of GCP cluster.')
    parser.add_argument('--zone', type=str, default='us-central1-f',
                        help='GCP zonal location of the cluster.')
    parser.add_argument('--project', type=str, default='k8s-prow-builds',
                        help='GCP Project that the cluster exists within.')
    return parser.parse_args(args)


def _parse_ratio(ratio):
    """Split 'ShrinkIncrement:GrowIncrement' into a pair of ints.

    Raises:
        ValueError: if the string is not two ':'-separated integers, if the
            shrink increment is negative, or if the grow increment is not
            positive (a zero grow increment would divide by zero when the
            number of steps is computed).
    """
    parts = ratio.split(':')
    if len(parts) != 2:
        raise ValueError(
            'ratio must look like ShrinkIncrement:GrowIncrement, got "{}"'.format(ratio))
    try:
        shrink_increment, grow_increment = int(parts[0]), int(parts[1])
    except ValueError:
        raise ValueError(
            'ratio increments must be integers, got "{}"'.format(ratio)) from None
    if shrink_increment < 0 or grow_increment <= 0:
        raise ValueError(
            'ratio needs ShrinkIncrement >= 0 and GrowIncrement > 0, got "{}"'.format(ratio))
    return shrink_increment, grow_increment


def _plan_resizes(pool_to_shrink_initial, pool_to_grow_initial,
                  nodes_to_add, shrink_increment, grow_increment):
    """Compute the pool sizes to request at every step.

    Returns:
        A list with one (shrink_pool_size, num_to_add, grow_pool_size) tuple
        per step. The shrink pool loses `shrink_increment` nodes per step but
        never goes below zero; the grow pool gains `grow_increment` nodes per
        step, except that the last step only adds what is still missing to
        reach `nodes_to_add` in total.
    """
    n_iter = int(math.ceil(float(nodes_to_add) / grow_increment))
    steps = []
    for step in range(n_iter):
        shrink_size = max(pool_to_shrink_initial - (step + 1) * shrink_increment, 0)
        # cap by the nodes still left to add, not by the absolute target size:
        # capping by the target overshoots when the grow pool starts non-empty
        num_to_add = min(grow_increment, nodes_to_add - step * grow_increment)
        grow_size = pool_to_grow_initial + step * grow_increment + num_to_add
        steps.append((shrink_size, num_to_add, grow_size))
    return steps


def main(options):
    """Shift capacity from one node pool to another, step by step.

    Reads the current pool sizes, prints the plan, asks for confirmation and
    then alternates shrinking `options.shrink` and growing `options.grow`.

    Args:
        options: namespace as returned by parse_args().

    Exits with an error message (before touching the cluster) when the node
    count is negative, the ratio is invalid, or a pool name is unknown.
    """
    # parse cli
    nodes_to_add = options.nodes
    if nodes_to_add < 0:
        sys.exit('Number of nodes to add must be >= 0, got {}'.format(nodes_to_add))
    try:
        shrink_increment, grow_increment = _parse_ratio(options.ratio)
    except ValueError as err:
        sys.exit(str(err))

    pool_to_grow = options.grow
    pool_to_shrink = options.shrink

    # obtain current pool sizes
    project, zone, cluster = options.project, options.zone, options.cluster
    pool_sizes = get_pool_sizes(project, zone, cluster)
    for pool in (pool_to_grow, pool_to_shrink):
        if pool not in pool_sizes:
            sys.exit('Node pool "{}" not found in cluster "{}"; known pools: {}'.format(
                pool, cluster, ', '.join(sorted(pool_sizes)) or '(none)'))
    pool_to_grow_initial = pool_sizes[pool_to_grow]
    pool_to_shrink_initial = pool_sizes[pool_to_shrink]

    # compute every intermediate size up front; the final sizes are the targets
    steps = _plan_resizes(pool_to_shrink_initial, pool_to_grow_initial,
                          nodes_to_add, shrink_increment, grow_increment)
    pool_to_grow_target = pool_to_grow_initial + nodes_to_add
    pool_to_shrink_target = steps[-1][0] if steps else pool_to_shrink_initial

    # verify with the user
    print((
        'Shifting NodePool capacity for project = "{project}",'
        'zone = "{zone}", cluster = "{cluster}"'
        ).format(
            project=project, zone=zone, cluster=cluster,
        ))
    print('')
    print((
        'Will add {nodes_to_add} node(s) to {pool_to_grow}'
        ' and drain {shrink_increment} node(s) from {pool_to_shrink}'
        ' for every {grow_increment} node(s) added to {pool_to_grow}'
        ).format(
            nodes_to_add=nodes_to_add, shrink_increment=shrink_increment,
            grow_increment=grow_increment, pool_to_grow=pool_to_grow,
            pool_to_shrink=pool_to_shrink,
        ))
    print('')
    print((
        'Current pool sizes are: {{{pool_to_grow}: {pool_to_grow_curr},'
        ' {pool_to_shrink}: {pool_to_shrink_curr}}}'
        ).format(
            pool_to_grow=pool_to_grow, pool_to_grow_curr=pool_to_grow_initial,
            pool_to_shrink=pool_to_shrink, pool_to_shrink_curr=pool_to_shrink_initial,
        ))
    print('')
    print((
        'Target pool sizes are: {{{pool_to_grow}: {pool_to_grow_target},'
        ' {pool_to_shrink}: {pool_to_shrink_target}}}'
        ).format(
            pool_to_grow=pool_to_grow, pool_to_grow_target=pool_to_grow_target,
            pool_to_shrink=pool_to_shrink, pool_to_shrink_target=pool_to_shrink_target,
        ))
    print('')

    prompt_confirmation()
    print('')

    # actually start resizing
    for shrink_size, num_to_add, grow_size in steps:
        # shrink by one increment, capped at reaching zero nodes
        print('Draining {shrink_increment} node(s) from {pool_to_shrink} ...'.format(
            shrink_increment=shrink_increment, pool_to_shrink=pool_to_shrink,
        ))
        resize_nodepool(pool_to_shrink, shrink_size, project, zone, cluster)
        print('')

        # ditto for growing, capped at the requested number of nodes
        print('Adding {num_to_add} node(s) to {pool_to_grow} ...'.format(
            num_to_add=num_to_add, pool_to_grow=pool_to_grow,
        ))
        resize_nodepool(pool_to_grow, grow_size, project, zone, cluster)
        print('')

    print('')
    print('Done')


if __name__ == '__main__':
    main(parse_args(sys.argv[1:]))