k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/experiment/maintenance/shift_nodepool_capacity.py (about)

     1  #!/usr/bin/env python3
     2  
     3  # Copyright 2018 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # This script drains nodes from one node pool and adds nodes to another n:m at a time
    18  #
    19  # Use like:
    20  # shift_nodepool_capacity.py num_to_add shrink_increment:grow_increment pool-to-drain pool-to-grow
    21  #
    22  # EG:
    23  # shift_nodepool_capacity.py 5 2:1 default-pool pool-n1-highmem-8-300gb
    24  #
    25  # for nodefs on the prow builds cluster
    26  # USE AT YOUR OWN RISK.
    27  
    28  import argparse
    29  import sys
    30  import subprocess
    31  import json
    32  import math
    33  
    34  
    35  def get_pool_sizes(project, zone, cluster):
    36      """returns a map of node pool name to size using the gcloud cli."""
    37      sizes = {}
    38  
    39      # map managed instance group names to node pools and record pool names
    40      node_pools = json.loads(subprocess.check_output([
    41          'gcloud', 'container', 'node-pools', 'list',
    42          '--project', project, '--cluster', cluster, '--zone', zone,
    43          '--format=json',
    44      ], encoding='utf-8'))
    45      group_to_pool = {}
    46      for pool in node_pools:
    47          # later on we will sum up node counts from instance groups
    48          sizes[pool['name']] = 0
    49          # this is somewhat brittle, the last component of the URL is the instance group name
    50          # the better way to do this is probably to use the APIs directly
    51          for url in pool['instanceGroupUrls']:
    52              instance_group = url.split('/')[-1]
    53              group_to_pool[instance_group] = pool['name']
    54  
    55      # map instance groups to node counts
    56      groups = json.loads(subprocess.check_output([
    57          'gcloud', 'compute', 'instance-groups', 'list',
    58          '--project', project, '--filter=zone:({})'.format(zone),
    59          '--format=json',
    60      ], encoding='utf-8'))
    61      for group in groups:
    62          if group['name'] not in group_to_pool:
    63              continue
    64          sizes[group_to_pool[group['name']]] += group['size']
    65  
    66      return sizes
    67  
    68  
    69  def resize_nodepool(pool, new_size, project, zone, cluster):
    70      """resize the nodepool to new_size using the gcloud cli"""
    71      cmd = [
    72          'gcloud', 'container', 'clusters', 'resize', cluster,
    73          '--zone', zone, '--project', project, '--node-pool', pool,
    74          '--num-nodes', str(new_size), '--quiet',
    75      ]
    76      print(cmd)
    77      subprocess.call(cmd)
    78  
    79  
    80  def prompt_confirmation():
    81      """prompts for interactive confirmation, exits 1 unless input is 'yes'"""
    82      sys.stdout.write('Please confirm (yes/no): ')
    83      response = input()
    84      if response != 'yes':
    85          print('Cancelling.')
    86          sys.exit(-1)
    87      print('Confirmed.')
    88  
    89  
    90  # xref prow/Makefile get-build-cluster-credentials
    91  def parse_args(args):
    92      parser = argparse.ArgumentParser()
    93      parser.add_argument('nodes', type=int,
    94                          help='Number of Nodes to add.')
    95      parser.add_argument('ratio', type=str,
    96                          help='ShrinkIncrement:GrowIncrement, Ex 2:1.')
    97      parser.add_argument('shrink', type=str,
    98                          help='Pool name to drain nodes from.')
    99      parser.add_argument('grow', type=str,
   100                          help='Pool name to grow nodes into.')
   101      parser.add_argument('--cluster', type=str, default="prow",
   102                          help='Name of GCP cluster.')
   103      parser.add_argument('--zone', type=str, default='us-central1-f',
   104                          help='GCP zonal location of the cluster.')
   105      parser.add_argument('--project', type=str, default='k8s-prow-builds',
   106                          help='GCP Project that the cluster exists within.')
   107      return parser.parse_args(args)
   108  
   109  
   110  def main(options):
   111      # parse cli
   112      nodes_to_add = options.nodes
   113  
   114      ratio = options.ratio.split(':')
   115      shrink_increment, grow_increment = int(ratio[0]), int(ratio[1])
   116  
   117      pool_to_grow = options.grow
   118      pool_to_shrink = options.shrink
   119  
   120      # obtain current pool sizes
   121      project, zone, cluster = options.project, options.zone, options.cluster
   122      pool_sizes = get_pool_sizes(project, zone, cluster)
   123      pool_to_grow_initial = pool_sizes[pool_to_grow]
   124      pool_to_shrink_initial = pool_sizes[pool_to_shrink]
   125  
   126      # compute final pool sizes
   127      pool_to_grow_target = pool_to_grow_initial + nodes_to_add
   128  
   129      n_iter = int(math.ceil(float(nodes_to_add) / grow_increment))
   130      pool_to_shrink_target = pool_to_shrink_initial - n_iter*shrink_increment
   131      if pool_to_shrink_target < 0:
   132          pool_to_shrink_target = 0
   133  
   134      # verify with the user
   135      print((
   136          'Shifting NodePool capacity for project = "{project}",'
   137          'zone = "{zone}", cluster = "{cluster}"'
   138          ).format(
   139              project=project, zone=zone, cluster=cluster,
   140          ))
   141      print('')
   142      print((
   143          'Will add {nodes_to_add} node(s) to {pool_to_grow}'
   144          ' and drain {shrink_increment} node(s) from {pool_to_shrink}'
   145          ' for every {grow_increment} node(s) added to {pool_to_grow}'
   146          ).format(
   147              nodes_to_add=nodes_to_add, shrink_increment=shrink_increment,
   148              grow_increment=grow_increment, pool_to_grow=pool_to_grow,
   149              pool_to_shrink=pool_to_shrink,
   150          ))
   151      print('')
   152      print((
   153          'Current pool sizes are: {{{pool_to_grow}: {pool_to_grow_curr},'
   154          ' {pool_to_shrink}: {pool_to_shrink_curr}}}'
   155          ).format(
   156              pool_to_grow=pool_to_grow, pool_to_grow_curr=pool_to_grow_initial,
   157              pool_to_shrink=pool_to_shrink, pool_to_shrink_curr=pool_to_shrink_initial,
   158          ))
   159      print('')
   160      print((
   161          'Target pool sizes are: {{{pool_to_grow}: {pool_to_grow_target},'
   162          ' {pool_to_shrink}: {pool_to_shrink_target}}}'
   163          ).format(
   164              pool_to_grow=pool_to_grow, pool_to_grow_target=pool_to_grow_target,
   165              pool_to_shrink=pool_to_shrink, pool_to_shrink_target=pool_to_shrink_target,
   166          ))
   167      print('')
   168  
   169      prompt_confirmation()
   170      print('')
   171  
   172  
   173      # actually start resizing
   174      # ignore pylint, "i" is a perfectly fine variable name for a loop counter...
   175      # pylint: disable=invalid-name
   176      for i in range(n_iter):
   177          # shrink by one increment, capped at reaching zero nodes
   178          print('Draining {shrink_increment} node(s) from {pool_to_shrink} ...'.format(
   179              shrink_increment=shrink_increment, pool_to_shrink=pool_to_shrink,
   180          ))
   181          new_size = max(pool_to_shrink_initial - (i*shrink_increment + shrink_increment), 0)
   182          resize_nodepool(pool_to_shrink, new_size, project, zone, cluster)
   183          print('')
   184  
   185          # ditto for growing, modulo the cap
   186          num_to_add = min(grow_increment, pool_to_grow_target - i*grow_increment)
   187          print('Adding {num_to_add} node(s) to {pool_to_grow} ...'.format(
   188              num_to_add=num_to_add, pool_to_grow=pool_to_grow,
   189          ))
   190          new_size = pool_to_grow_initial + (i*grow_increment + num_to_add)
   191          resize_nodepool(pool_to_grow, new_size, project, zone, cluster)
   192          print('')
   193  
   194      print('')
   195      print('Done')
   196  
   197  if __name__ == '__main__':
   198      main(parse_args(sys.argv[1:]))