#!/usr/bin/env python

# Copyright 2018 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script drains nodes from one node pool and adds nodes to another n:m at a time
#
# Use like:
# shift_nodepool_capacity.py pool-to-drain pool-to-grow shrink_increment:grow_increment num_to_add
#
# EG:
# shift_nodepool_capacity.py default-pool pool-n1-highmem-8-300gb 2:1 5
#
# for nodefs on the prow builds cluster
# USE AT YOUR OWN RISK.
# TODO(bentheelder): delete this once dynamic kubelet config is available


from __future__ import print_function

import sys
import subprocess
import json
import math

# xref prow/Makefile get-build-cluster-credentials
# TODO(bentheelder): perhaps make these configurable
CLUSTER = 'prow'
ZONE = 'us-central1-f'
PROJECT = 'k8s-prow-builds'

USAGE = (
    'usage: shift_nodepool_capacity.py pool-to-drain pool-to-grow'
    ' shrink_increment:grow_increment num_to_add'
)


def _gcloud_json(cmd):
    """runs cmd (an argv list) and returns its stdout parsed as JSON."""
    # check_output returns bytes on Python 3; decode so json.loads works on
    # every interpreter version.
    return json.loads(subprocess.check_output(cmd).decode('utf-8'))


def sum_pool_sizes(node_pools, groups):
    """returns a map of node pool name to size from parsed gcloud output.

    Args:
        node_pools: list of node pool dicts, each with a 'name' and a list of
            'instanceGroupUrls'.
        groups: list of instance group dicts, each with a 'name' and a 'size'.

    Returns:
        dict mapping every pool name in node_pools to the summed size of its
        instance groups. Instance groups that do not belong to any of the
        given pools are ignored.
    """
    sizes = {}
    group_to_pool = {}
    for pool in node_pools:
        # later on we will sum up node counts from instance groups
        sizes[pool['name']] = 0
        # this is somewhat brittle, the last component of the URL is the instance group name
        # the better way to do this is probably to use the APIs directly
        for url in pool['instanceGroupUrls']:
            group_to_pool[url.split('/')[-1]] = pool['name']

    for group in groups:
        pool_name = group_to_pool.get(group['name'])
        if pool_name is None:
            # the listing covers every instance group in the zone, including
            # ones that are not part of this cluster; skip those
            continue
        sizes[pool_name] += group['size']

    return sizes


def get_pool_sizes(project, zone, cluster):
    """returns a map of node pool name to size using the gcloud cli."""
    # map managed instance group names to node pools and record pool names
    node_pools = _gcloud_json([
        'gcloud', 'container', 'node-pools', 'list',
        '--project', project, '--cluster', cluster, '--zone', zone,
        '--format=json',
    ])

    # map instance groups to node counts
    groups = _gcloud_json([
        'gcloud', 'compute', 'instance-groups', 'list',
        '--project', project, '--filter=zone:({})'.format(zone),
        '--format=json',
    ])

    return sum_pool_sizes(node_pools, groups)


def resize_nodepool(pool, new_size, project, zone, cluster):
    """resize the nodepool to new_size using the gcloud cli

    Raises:
        subprocess.CalledProcessError: if gcloud exits non-zero. Aborting here
            keeps the script from draining further nodes after a failed resize.
    """
    cmd = [
        'gcloud', 'container', 'clusters', 'resize', cluster,
        '--zone', zone, '--project', project, '--node-pool', pool,
        '--size', str(new_size), '--quiet',
    ]
    print(cmd)
    subprocess.check_call(cmd)


def prompt_confirmation():
    """prompts for interactive confirmation, exits non-zero unless input is 'yes'"""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    sys.stdout.write('Please confirm (yes/no): ')
    # the prompt has no trailing newline, so flush it before blocking on stdin
    sys.stdout.flush()
    response = read_line()
    if response != 'yes':
        print('Cancelling.')
        sys.exit(-1)
    print('Confirmed.')


def parse_args(argv):
    """parses the command line into its five settings.

    Args:
        argv: full argument vector including the program name; only the last
            four entries are read.

    Returns:
        tuple (pool_to_shrink, pool_to_grow, shrink_increment, grow_increment,
        nodes_to_add) with the three numeric values converted to int.

    Raises:
        ValueError: if there are too few arguments, a number does not parse,
            the ratio is not of the form N:M, or a value is out of range.
    """
    if len(argv) < 5:
        raise ValueError('expected 4 arguments, got {}'.format(max(len(argv) - 1, 0)))
    pool_to_shrink, pool_to_grow, ratio_arg, count_arg = argv[-4:]

    ratio = ratio_arg.split(':')
    if len(ratio) != 2:
        raise ValueError(
            'ratio must look like shrink_increment:grow_increment, got "{}"'.format(ratio_arg))
    shrink_increment, grow_increment = int(ratio[0]), int(ratio[1])
    nodes_to_add = int(count_arg)

    if shrink_increment < 0:
        raise ValueError('shrink_increment must not be negative')
    if grow_increment < 1:
        raise ValueError('grow_increment must be at least 1')
    if nodes_to_add < 0:
        raise ValueError('num_to_add must not be negative')

    return pool_to_shrink, pool_to_grow, shrink_increment, grow_increment, nodes_to_add


def compute_resize_steps(pool_to_shrink_initial, pool_to_grow_initial,
                         nodes_to_add, shrink_increment, grow_increment):
    """computes the sequence of pool sizes to step through.

    Args:
        pool_to_shrink_initial: current size of the pool being drained.
        pool_to_grow_initial: current size of the pool being grown.
        nodes_to_add: total number of nodes to add to the growing pool.
        shrink_increment: nodes drained per step.
        grow_increment: maximum nodes added per step.

    Returns:
        list of (new_shrink_size, num_to_add, new_grow_size) tuples, one per
        step. The shrinking pool is capped at zero and the growing pool never
        exceeds pool_to_grow_initial + nodes_to_add.

    Raises:
        ValueError: if grow_increment is less than 1.
    """
    if grow_increment < 1:
        raise ValueError('grow_increment must be at least 1')

    n_iter = int(math.ceil(float(nodes_to_add) / grow_increment))
    steps = []
    # pylint: disable=invalid-name
    for i in range(n_iter):
        # shrink by one increment, capped at reaching zero nodes
        new_shrink_size = max(pool_to_shrink_initial - (i + 1) * shrink_increment, 0)
        # grow by one increment, capped at the nodes still left to add (not at
        # the absolute target size, which would overshoot on the last step)
        num_to_add = min(grow_increment, nodes_to_add - i * grow_increment)
        new_grow_size = pool_to_grow_initial + i * grow_increment + num_to_add
        steps.append((new_shrink_size, num_to_add, new_grow_size))
    return steps


def main():
    """parses the cli, confirms the plan with the user, then resizes both pools."""
    # parse cli
    try:
        (pool_to_shrink, pool_to_grow,
         shrink_increment, grow_increment, nodes_to_add) = parse_args(sys.argv)
    except ValueError as err:
        print('error: {}'.format(err), file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(2)

    # obtain current pool sizes
    pool_sizes = get_pool_sizes(PROJECT, ZONE, CLUSTER)
    for pool in (pool_to_shrink, pool_to_grow):
        if pool not in pool_sizes:
            print('error: unknown node pool "{}", known pools: {}'.format(
                pool, sorted(pool_sizes)), file=sys.stderr)
            sys.exit(1)
    pool_to_grow_initial = pool_sizes[pool_to_grow]
    pool_to_shrink_initial = pool_sizes[pool_to_shrink]

    # compute the per-step and final pool sizes
    steps = compute_resize_steps(
        pool_to_shrink_initial, pool_to_grow_initial,
        nodes_to_add, shrink_increment, grow_increment)
    pool_to_grow_target = pool_to_grow_initial + nodes_to_add
    pool_to_shrink_target = max(pool_to_shrink_initial - len(steps) * shrink_increment, 0)

    # verify with the user
    print((
        'Shifting NodePool capacity for project = "{project}",'
        'zone = "{zone}", cluster = "{cluster}"'
    ).format(
        project=PROJECT, zone=ZONE, cluster=CLUSTER,
    ))
    print('')
    print((
        'Will add {nodes_to_add} node(s) to {pool_to_grow}'
        ' and drain {shrink_increment} node(s) from {pool_to_shrink}'
        ' for every {grow_increment} node(s) added to {pool_to_grow}'
    ).format(
        nodes_to_add=nodes_to_add, shrink_increment=shrink_increment,
        grow_increment=grow_increment, pool_to_grow=pool_to_grow,
        pool_to_shrink=pool_to_shrink,
    ))
    print('')
    print((
        'Current pool sizes are: {{{pool_to_grow}: {pool_to_grow_curr},'
        ' {pool_to_shrink}: {pool_to_shrink_curr}}}'
    ).format(
        pool_to_grow=pool_to_grow, pool_to_grow_curr=pool_to_grow_initial,
        pool_to_shrink=pool_to_shrink, pool_to_shrink_curr=pool_to_shrink_initial,
    ))
    print('')
    print((
        'Target pool sizes are: {{{pool_to_grow}: {pool_to_grow_target},'
        ' {pool_to_shrink}: {pool_to_shrink_target}}}'
    ).format(
        pool_to_grow=pool_to_grow, pool_to_grow_target=pool_to_grow_target,
        pool_to_shrink=pool_to_shrink, pool_to_shrink_target=pool_to_shrink_target,
    ))
    print('')

    prompt_confirmation()
    print('')

    # actually start resizing
    for new_shrink_size, num_to_add, new_grow_size in steps:
        print('Draining {shrink_increment} node(s) from {pool_to_shrink} ...'.format(
            shrink_increment=shrink_increment, pool_to_shrink=pool_to_shrink,
        ))
        resize_nodepool(pool_to_shrink, new_shrink_size, PROJECT, ZONE, CLUSTER)
        print('')

        print('Adding {num_to_add} node(s) to {pool_to_grow} ...'.format(
            num_to_add=num_to_add, pool_to_grow=pool_to_grow,
        ))
        resize_nodepool(pool_to_grow, new_grow_size, PROJECT, ZONE, CLUSTER)
        print('')

    print('')
    print('Done')


if __name__ == '__main__':
    main()