k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/boskos/cmd/janitor/gcp_janitor.py (about)

     1  #!/usr/bin/env python3
     2  
     3  # Copyright 2016 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Clean up resources from gcp projects. """
    18  
    19  import argparse
    20  import collections
    21  import datetime
    22  import json
    23  import os
    24  import subprocess
    25  import sys
    26  import threading
    27  
# A resource type that needs to be cleared.
#
# Fields, as they are consumed by the functions in this file:
#   api_version: optional token inserted right after 'gcloud' when building a
#                command ('' means none) - presumably a release track such as
#                alpha/beta; TODO confirm.
#   group:       gcloud command group, e.g. 'compute' or 'logging'.
#   name:        resource collection inside the group, e.g. 'instances'.
#   subgroup:    optional extra command level appended after name, or None.
#   condition:   'zone', 'region', 'global' or None; decides how listed items
#                are bucketed and which location flag the delete call gets.
#   managed:     'Yes', 'No' or None; when set, only items whose 'isManaged'
#                value equals it are kept, and the matching 'managed' /
#                'unmanaged' subcommand is used for deletion.
#   tolerate:    when True, a failing list or delete call is not counted as
#                an error.
#   bulk_delete: when False, resources are deleted one per gcloud call.
Resource = collections.namedtuple(
    'Resource', 'api_version group name subgroup condition managed tolerate bulk_delete')

# Resource types in the order main() lists and deletes them.
DEMOLISH_ORDER = [
    # [WARNING FROM KRZYZACY] : TOUCH THIS WITH CARE!
    # ORDER REALLY MATTERS HERE!

    # compute resources
    Resource('', 'compute', 'instances', None, 'zone', None, False, True),
    Resource('', 'compute', 'addresses', None, 'global', None, False, True),
    Resource('', 'compute', 'addresses', None, 'region', None, False, True),
    Resource('', 'compute', 'disks', None, 'zone', None, False, True),
    Resource('', 'compute', 'disks', None, 'region', None, False, True),
    Resource('', 'compute', 'firewall-rules', None, None, None, False, True),
    Resource('', 'compute', 'forwarding-rules', None, 'global', None, False, True),
    Resource('', 'compute', 'forwarding-rules', None, 'region', None, False, True),
    Resource('', 'compute', 'target-http-proxies', None, 'global', None, False, True),
    Resource('', 'compute', 'target-http-proxies', None, 'region', None, False, True),
    Resource('', 'compute', 'target-https-proxies', None, 'global', None, False, True),
    Resource('', 'compute', 'target-https-proxies', None, 'region', None, False, True),
    Resource('', 'compute', 'target-tcp-proxies', None, None, None, False, True),
    Resource('', 'compute', 'ssl-certificates', None, 'global', None, False, True),
    Resource('', 'compute', 'ssl-certificates', None, 'region', None, False, True),
    Resource('', 'compute', 'url-maps', None, 'global', None, False, True),
    Resource('', 'compute', 'url-maps', None, 'region', None, False, True),
    Resource('', 'compute', 'backend-services', None, 'global', None, False, True),
    Resource('', 'compute', 'backend-services', None, 'region', None, False, True),
    Resource('', 'compute', 'target-pools', None, 'region', None, False, True),
    Resource('', 'compute', 'health-checks', None, 'global', None, False, True),
    Resource('', 'compute', 'health-checks', None, 'region', None, False, True),
    Resource('', 'compute', 'http-health-checks', None, None, None, False, True),
    Resource('', 'compute', 'instance-groups', None, 'region', 'Yes', False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'Yes', False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'No', False, True),
    Resource('', 'compute', 'instance-templates', None, None, None, False, True),
    Resource('', 'compute', 'sole-tenancy', 'node-groups', 'zone', None, False, True),
    Resource('', 'compute', 'sole-tenancy', 'node-templates', 'region', None, False, True),
    Resource('', 'compute', 'network-endpoint-groups', None, 'zone', None, False, False),
    Resource('', 'compute', 'routes', None, None, None, False, True),
    Resource('', 'compute', 'routers', None, 'region', None, False, True),
    Resource('', 'compute', 'networks', 'subnets', 'region', None, True, True),
    Resource('', 'compute', 'networks', None, None, None, False, True),

    # logging resources
    Resource('', 'logging', 'sinks', None, None, None, False, False),
]
    74  
    75  
    76  def log(message):
    77      """ print a message if --verbose is set. """
    78      if ARGS.verbose:
    79          tss = "[" + str(datetime.datetime.now()) + "] "
    80          print(tss + message + '\n')
    81  
    82  
    83  def base_command(resource):
    84      """ Return the base gcloud command with api_version, group and subgroup.
    85  
    86      Args:
    87          resource: Definition of a type of gcloud resource.
    88      Returns:
    89          list of base commands of gcloud .
    90      """
    91  
    92      base = ['gcloud']
    93      if resource.api_version:
    94          base += [resource.api_version]
    95      base += [resource.group, '-q', resource.name]
    96      if resource.subgroup:
    97          base.append(resource.subgroup)
    98      return base
    99  
   100  
   101  def validate_item(item, age, resource, clear_all):
   102      """ Validate if an item need to be cleaned.
   103  
   104      Args:
   105          item: a gcloud resource item from json format.
   106          age: Time cutoff from the creation of a resource.
   107          resource: Definition of a type of gcloud resource.
   108          clear_all: If need to clean regardless of timestamp.
   109      Returns:
   110          True if object need to be cleaned, False otherwise.
   111      Raises:
   112          ValueError if json result from gcloud is invalid.
   113      """
   114  
   115      if resource.managed:
   116          if 'isManaged' not in item:
   117              raise ValueError(resource.name, resource.managed)
   118          if resource.managed != item['isManaged']:
   119              return False
   120  
   121      # clears everything without checking creationTimestamp
   122      if clear_all:
   123          return True
   124  
   125      if 'creationTimestamp' not in item:
   126          raise ValueError('missing key: creationTimestamp - %r' % item)
   127  
   128      # Unify datetime to use utc timezone.
   129      created = datetime.datetime.strptime(item['creationTimestamp'], '%Y-%m-%dT%H:%M:%S')
   130      log('Found %r(%r), %r, created time = %r' %
   131          (resource.name, resource.subgroup, item['name'], item['creationTimestamp']))
   132      if created < age:
   133          log('Added to janitor list: %r(%r), %r' %
   134              (resource.name, resource.subgroup, item['name']))
   135          return True
   136      return False
   137  
   138  
   139  def collect(project, age, resource, filt, clear_all):
   140      """ Collect a list of resources for each condition (zone or region).
   141  
   142      Args:
   143          project: The name of a gcp project.
   144          age: Time cutoff from the creation of a resource.
   145          resource: Definition of a type of gcloud resource.
   146          filt: Filter clause for gcloud list command.
   147          clear_all: If need to clean regardless of timestamp.
   148      Returns:
   149          A dict of condition : list of gcloud resource object.
   150      Raises:
   151          ValueError if json result from gcloud is invalid.
   152          subprocess.CalledProcessError if cannot list the gcloud resource
   153      """
   154  
   155      col = collections.defaultdict(list)
   156  
   157      # TODO(krzyzacy): logging sink does not have timestamp
   158      #                 don't even bother listing it if not clear_all
   159      if resource.name == 'sinks' and not clear_all:
   160          return col
   161  
   162      cmd = base_command(resource)
   163      cmd.extend([
   164          'list',
   165          '--format=json(name,creationTimestamp.date(tz=UTC),zone,region,isManaged)',
   166          '--filter=%s' % filt,
   167          '--project=%s' % project])
   168      if resource.condition == 'zone' and resource.name != 'sole-tenancy' and resource.name != 'network-endpoint-groups':
   169          cmd.append('--zones=asia-east1-a,asia-east1-b,asia-east1-c,asia-east2-a,asia-east2-b,asia-east2-c,' +
   170              'asia-northeast1-a,asia-northeast1-b,asia-northeast1-c,asia-northeast2-a,asia-northeast2-b,asia-northeast2-c,' +
   171              'asia-northeast3-a,asia-northeast3-b,asia-northeast3-c,asia-south1-a,asia-south1-b,asia-south1-c,' +
   172              'asia-southeast1-a,asia-southeast1-b,asia-southeast1-c,australia-southeast1-a,australia-southeast1-b,' +
   173              'australia-southeast1-c,europe-north1-a,europe-north1-b,europe-north1-c,europe-west1-b,europe-west1-c,' +
   174              'europe-west1-d,europe-west2-a,europe-west2-b,europe-west2-c,europe-west3-a,europe-west3-b,europe-west3-c,' +
   175              'europe-west4-a,europe-west4-b,europe-west4-c,europe-west6-a,europe-west6-b,europe-west6-c,' +
   176              'northamerica-northeast1-a,northamerica-northeast1-b,northamerica-northeast1-c,southamerica-east1-a,' +
   177              'southamerica-east1-b,southamerica-east1-c,us-central1-a,us-central1-b,us-central1-c,us-central1-f,' +
   178              'us-east1-b,us-east1-c,us-east1-d,us-east4-a,us-east4-b,us-east4-c,us-west1-a,us-west1-b,us-west1-c,' +
   179              'us-west2-a,us-west2-b,us-west2-c,us-west3-a,us-west3-b,us-west3-c')
   180      log('%r' % cmd)
   181  
   182      # TODO(krzyzacy): work around for alpha API list calls
   183      try:
   184          items = subprocess.check_output(cmd)
   185      except subprocess.CalledProcessError:
   186          if resource.tolerate:
   187              return col
   188          raise
   189  
   190      for item in json.loads(items):
   191          log('parsing item: %r' % item)
   192  
   193          if 'name' not in item:
   194              raise ValueError('missing key: name - %r' % item)
   195  
   196          colname = ''
   197          if resource.condition is not None:
   198              # This subcommand will want either a --global, --region, or --zone
   199              # flag, so segment items accordingly.
   200              if resource.condition == 'global':
   201                  if 'zone' in item or 'region' in item:
   202                      # This item is zonal or regional, so don't include it in
   203                      # the global list.
   204                      continue
   205              elif resource.condition in item:
   206                  # Looking for zonal or regional items, and this matches.
   207                  # The zone or region is sometimes a full URL (why?), but
   208                  # subcommands want just the name, not the full URL, so strip it.
   209                  colname = item[resource.condition].rsplit('/', 1)[-1]
   210                  log('looking for items in %s=%s' % (resource.condition, colname))
   211              else:
   212                  # This item doesn't match the condition, so don't include it.
   213                  continue
   214  
   215          if validate_item(item, age, resource, clear_all):
   216              col[colname].append(item['name'])
   217      return col
   218  
   219  def asyncCall(cmd, tolerate, name, errs, lock, hide_output):
   220      log('%sCall %r' % ('[DRYRUN] ' if ARGS.dryrun else '', cmd))
   221      if ARGS.dryrun:
   222          return
   223      try:
   224          if hide_output:
   225              FNULL = open(os.devnull, 'w')
   226              subprocess.check_call(cmd, stdout=FNULL)
   227          else:
   228              subprocess.check_call(cmd)
   229      except subprocess.CalledProcessError as exc:
   230          if not tolerate:
   231              with lock:
   232                  errs.append(exc)
   233          print('Error try to delete resources %s: %r' % (name, exc), file=sys.stderr)
   234  
   235  def clear_resources(project, cols, resource, rate_limit):
   236      """Clear a collection of resource, from collect func above.
   237  
   238      Args:
   239          project: The name of a gcp project.
   240          cols: A dict of collection of resource.
   241          resource: Definition of a type of gcloud resource.
   242          rate_limit: how many resources to delete per gcloud delete call
   243      Returns:
   244          0 if no error
   245          > 0 if deletion command fails
   246      """
   247      errs = []
   248      threads = list()
   249      lock = threading.Lock()
   250  
   251      # delete one resource at a time, if there's no api support
   252      # aka, logging sinks for example
   253      if not resource.bulk_delete:
   254          rate_limit = 1
   255  
   256      for col, items in list(cols.items()):
   257          manage_key = {'Yes': 'managed', 'No': 'unmanaged'}
   258  
   259          # construct the customized gcloud command
   260          base = base_command(resource)
   261          if resource.managed:
   262              base.append(manage_key[resource.managed])
   263          base.append('delete')
   264          base.append('--project=%s' % project)
   265  
   266          condition = None
   267          if resource.condition and col:
   268              condition = '--%s=%s' % (resource.condition, col)
   269          elif resource.condition == 'global':
   270              condition = '--global'
   271  
   272          log('going to delete %d %s' % (len(items), resource.name))
   273          # try to delete at most $rate_limit items at a time
   274          for idx in range(0, len(items), rate_limit):
   275              clean = items[idx:idx + rate_limit]
   276              cmd = base + list(clean)
   277              if condition:
   278                  cmd.append(condition)
   279              thread = threading.Thread(
   280                  target=asyncCall, args=(cmd, resource.tolerate, resource.name, errs, lock, False))
   281              threads.append(thread)
   282              log('start a new thread, total %d' % len(threads))
   283              thread.start()
   284  
   285      log('Waiting for all %d thread to finish' % len(threads))
   286      for thread in threads:
   287          thread.join()
   288      return len(errs)
   289  
   290  
   291  def clean_gke_cluster(project, age, filt):
   292      """Clean up potential leaking gke cluster"""
   293  
   294      # a cluster can be created in one of those three endpoints
   295      endpoints = [
   296          'https://test-container.sandbox.googleapis.com/',  # test
   297          'https://staging-container.sandbox.googleapis.com/',  # staging
   298          'https://staging2-container.sandbox.googleapis.com/', # staging2
   299          'https://container.googleapis.com/',  # prod
   300      ]
   301  
   302      errs = []
   303  
   304      for endpoint in endpoints:
   305          threads = list()
   306          lock = threading.Lock()
   307  
   308          os.environ['CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER'] = endpoint
   309          log("checking endpoint %s" % endpoint)
   310          cmd = [
   311              'gcloud', 'container', '-q', 'clusters', 'list',
   312              '--project=%s' % project,
   313              '--filter=%s' % filt,
   314              '--format=json(name,createTime,region,zone)'
   315          ]
   316          log('running %s' % cmd)
   317  
   318          output = ''
   319          try:
   320              output = subprocess.check_output(cmd)
   321          except subprocess.CalledProcessError as exc:
   322              # expected error
   323              log('Cannot reach endpoint %s with %r, continue' % (endpoint, exc))
   324              continue
   325  
   326          for item in json.loads(output):
   327              log('cluster info: %r' % item)
   328              if 'name' not in item or 'createTime' not in item:
   329                  raise ValueError('name and createTime must be present: %r' % item)
   330              if not ('zone' in item or 'region' in item):
   331                  raise ValueError('either zone or region must be present: %r' % item)
   332  
   333              # The raw createTime string looks like 2017-08-30T18:33:14+00:00
   334              # Which python 2.7 does not support timezones.
   335              # Since age is already in UTC time we'll just strip the timezone part
   336              item['createTime'] = item['createTime'].split('+')[0]
   337              created = datetime.datetime.strptime(
   338                  item['createTime'], '%Y-%m-%dT%H:%M:%S')
   339  
   340              if created < age:
   341                  log('Found stale gke cluster %r in %r, created time = %r' %
   342                      (item['name'], endpoint, item['createTime']))
   343                  delete = [
   344                      'gcloud', 'container', '-q', 'clusters', 'delete',
   345                      item['name'],
   346                      '--project=%s' % project,
   347                  ]
   348                  if 'zone' in item:
   349                      delete.append('--zone=%s' % item['zone'])
   350                  elif 'region' in item:
   351                      delete.append('--region=%s' % item['region'])
   352                  thread = threading.Thread(
   353                      target=asyncCall, args=(delete, False, item['name'], errs, lock, True))
   354                  threads.append(thread)
   355                  log('start a new thread, total %d' % len(threads))
   356                  thread.start()
   357  
   358          log('Waiting for all %d thread to finish in %s' % (len(threads), endpoint))
   359          for thread in threads:
   360              thread.join()
   361  
   362      return len(errs) > 0
   363  
   364  
   365  def activate_service_account(service_account):
   366      print('[=== Activating service_account %s ===]' % service_account)
   367      cmd = [
   368          'gcloud', 'auth', 'activate-service-account',
   369          '--key-file=%s' % service_account,
   370      ]
   371      log('running %s' % cmd)
   372  
   373      try:
   374          subprocess.check_call(cmd)
   375      except subprocess.CalledProcessError:
   376          print('Error try to activate service_account: %s' % service_account, file=sys.stderr)
   377          return 1
   378      return 0
   379  
   380  
   381  def main(project, days, hours, filt, rate_limit, service_account):
   382      """ Clean up resources from a gcp project based on it's creation time
   383  
   384      Args:
   385          project: The name of a gcp project.
   386          days/hours: days/hours of maximum lifetime of a gcp resource.
   387          filt: Resource instance filters when query.
   388      Returns:
   389          0 if no error
   390          1 if list or delete command fails
   391      """
   392  
   393      print('[=== Start Janitor on project %r ===]' % project)
   394      err = 0
   395      age = datetime.datetime.utcnow() - datetime.timedelta(days=days, hours=hours)
   396      clear_all = (days == 0 and hours == 0)
   397  
   398      if service_account:
   399          err |= activate_service_account(service_account)
   400          if err:
   401              print('Failed to activate service account %r' % (
   402                  service_account), file=sys.stderr)
   403              sys.exit(err)
   404  
   405      # try to clean a leaked GKE cluster first, rather than attempting to delete
   406      # its associated resources individually.
   407      try:
   408          err |= clean_gke_cluster(project, age, filt)
   409      except ValueError:
   410          err |= 1  # keep clean the other resource
   411          print('Fail to clean up cluster from project %r' % project, file=sys.stderr)
   412  
   413      for res in DEMOLISH_ORDER:
   414          log('Try to search for %r with condition %r, managed %r' % (
   415              res.name, res.condition, res.managed))
   416          try:
   417              col = collect(project, age, res, filt, clear_all)
   418              if col:
   419                  err |= clear_resources(project, col, res, rate_limit)
   420          except (subprocess.CalledProcessError, ValueError):
   421              err |= 1  # keep clean the other resource
   422              print('Fail to list resource %r from project %r' % (
   423                  res.name, project), file=sys.stderr)
   424  
   425      print('[=== Finish Janitor on project %r with status %r ===]' % (project, err))
   426      sys.exit(err)
   427  
   428  
# Script entry point: parse the command line into the module-level ARGS
# (read by log() and asyncCall()) and run the janitor.
if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Clean up resources from an expired project')
    PARSER.add_argument('--project', help='Project to clean', required=True)
    # --days and --hours have no default so that "not given" (None) can be
    # told apart from an explicit 0.
    PARSER.add_argument(
        '--days', type=int,
        help='Clean items more than --days old (added to --hours)')
    PARSER.add_argument(
        '--hours', type=float,
        help='Clean items more than --hours old (added to --days)')
    # The default filter skips resources whose name starts with "default".
    PARSER.add_argument(
        '--filter',
        default='name !~ ^default',
        help='Filter down to these instances')
    PARSER.add_argument(
        '--dryrun',
        default=False,
        action='store_true',
        help='List but not delete resources')
    PARSER.add_argument(
        '--ratelimit', type=int, default=50,
        help='Max number of resources to bulk clear in one gcloud delete call')
    PARSER.add_argument(
        '--verbose', action='store_true',
        help='Get full janitor output log')
    # Falls back to the GOOGLE_APPLICATION_CREDENTIALS environment variable.
    PARSER.add_argument(
        '--service_account',
        help='GCP service account',
        default=os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", None))
    ARGS = PARSER.parse_args()

    # We want to allow --days=0 and --hours=0, so check against None instead.
    if ARGS.days is None and ARGS.hours is None:
        print('must specify --days and/or --hours', file=sys.stderr)
        sys.exit(1)

    # Whichever of --days/--hours was omitted counts as 0.
    main(ARGS.project, ARGS.days or 0, ARGS.hours or 0, ARGS.filter,
         ARGS.ratelimit, ARGS.service_account)