github.com/abayer/test-infra@v0.0.5/boskos/janitor/janitor.py (about)

     1  #!/usr/bin/env python
     2  
     3  # Copyright 2016 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Clean up resources from gcp projects. """
    18  
    19  import argparse
    20  import collections
    21  import datetime
    22  import json
    23  import os
    24  import subprocess
    25  import sys
    26  
    27  
# A type of gcloud resource that needs to be cleared.
#
# Fields:
#   api_version: optional token inserted right after 'gcloud' (e.g. 'beta');
#                an empty string means no token is added.
#   group:       gcloud command group (e.g. 'compute', 'logging').
#   name:        resource collection name (e.g. 'instances').
#   subgroup:    optional token appended after name (e.g. 'subnets').
#   condition:   item key ('zone' or 'region') used to bucket listed items and
#                to build the --zone/--region flag on delete; when set but the
#                bucket is empty, '--global' is passed instead. A falsy value
#                means no such flag is added.
#   managed:     'Yes' or 'No' to match against an item's isManaged field and
#                to pick the 'managed'/'unmanaged' delete subcommand, or None
#                to skip that check.
#   tolerate:    when True, a failed list returns nothing instead of raising,
#                and a failed delete is not counted as an error.
#   bulk_delete: when False, items are deleted one per gcloud call.
Resource = collections.namedtuple(
    'Resource', 'api_version group name subgroup condition managed tolerate bulk_delete')

# Resource types are processed in exactly this order, top to bottom.
DEMOLISH_ORDER = [
    # [WARNING FROM KRZYZACY] : TOUCH THIS WITH CARE!
    # ORDER REALLY MATTERS HERE!

    # compute resources
    Resource('', 'compute', 'instances', None, 'zone', None, False, True),
    Resource('', 'compute', 'addresses', None, 'region', None, False, True),
    Resource('', 'compute', 'disks', None, 'zone', None, False, True),
    Resource('', 'compute', 'firewall-rules', None, None, None, False, True),
    Resource('', 'compute', 'routes', None, None, None, False, True),
    Resource('', 'compute', 'forwarding-rules', None, 'region', None, False, True),
    Resource('', 'compute', 'target-http-proxies', None, None, None, False, True),
    Resource('', 'compute', 'target-https-proxies', None, None, None, False, True),
    Resource('', 'compute', 'url-maps', None, None, None, False, True),
    Resource('', 'compute', 'backend-services', None, 'region', None, False, True),
    Resource('', 'compute', 'target-pools', None, 'region', None, False, True),
    Resource('', 'compute', 'health-checks', None, None, None, False, True),
    Resource('', 'compute', 'http-health-checks', None, None, None, False, True),
    # instance-groups are listed twice: managed ('Yes') first, then unmanaged ('No')
    Resource('', 'compute', 'instance-groups', None, 'zone', 'Yes', False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'No', False, True),
    Resource('', 'compute', 'instance-templates', None, None, None, False, True),
    Resource('beta', 'compute', 'network-endpoint-groups', None, None, None, True, False),
    Resource('', 'compute', 'networks', 'subnets', 'region', None, True, True),
    Resource('', 'compute', 'networks', None, '', None, False, True),
    # routes appear a second time here, after networks
    # NOTE(review): presumably to catch routes that could not be removed earlier - confirm
    Resource('', 'compute', 'routes', None, None, None, False, True),

    # logging resources
    Resource('', 'logging', 'sinks', None, None, None, False, False),
]
    60  
    61  def log(message):
    62      """ print a message if --verbose is set. """
    63      if ARGS.verbose:
    64          print message
    65  
    66  def base_command(resource):
    67      """ Return the base gcloud command with api_version, group and subgroup.
    68  
    69      Args:
    70          resource: Definition of a type of gcloud resource.
    71      Returns:
    72          list of base commands of gcloud .
    73      """
    74  
    75      base = ['gcloud']
    76      if resource.api_version:
    77          base += [resource.api_version]
    78      base += [resource.group, '-q', resource.name]
    79      if resource.subgroup:
    80          base.append(resource.subgroup)
    81      return base
    82  
    83  
    84  def validate_item(item, age, resource, clear_all):
    85      """ Validate if an item need to be cleaned.
    86  
    87      Args:
    88          item: a gcloud resource item from json format.
    89          age: Time cutoff from the creation of a resource.
    90          resource: Definition of a type of gcloud resource.
    91          clear_all: If need to clean regardless of timestamp.
    92      Returns:
    93          True if object need to be cleaned, False otherwise.
    94      Raises:
    95          ValueError if json result from gcloud is invalid.
    96      """
    97  
    98      if resource.managed:
    99          if 'isManaged' not in item:
   100              raise ValueError(resource.name, resource.managed)
   101          if resource.managed != item['isManaged']:
   102              return False
   103  
   104      # clears everything without checking creationTimestamp
   105      if clear_all:
   106          return True
   107  
   108      if 'creationTimestamp' not in item:
   109          raise ValueError('missing key: creationTimestamp - %r' % item)
   110  
   111      # Unify datetime to use utc timezone.
   112      created = datetime.datetime.strptime(item['creationTimestamp'], '%Y-%m-%dT%H:%M:%S')
   113      log('Found %r(%r), %r, created time = %r' %
   114          (resource.name, resource.subgroup, item['name'], item['creationTimestamp']))
   115      if created < age:
   116          log('Added to janitor list: %r(%r), %r' %
   117              (resource.name, resource.subgroup, item['name']))
   118          return True
   119      return False
   120  
   121  
   122  def collect(project, age, resource, filt, clear_all):
   123      """ Collect a list of resources for each condition (zone or region).
   124  
   125      Args:
   126          project: The name of a gcp project.
   127          age: Time cutoff from the creation of a resource.
   128          resource: Definition of a type of gcloud resource.
   129          filt: Filter clause for gcloud list command.
   130          clear_all: If need to clean regardless of timestamp.
   131      Returns:
   132          A dict of condition : list of gcloud resource object.
   133      Raises:
   134          ValueError if json result from gcloud is invalid.
   135          subprocess.CalledProcessError if cannot list the gcloud resource
   136      """
   137  
   138      col = collections.defaultdict(list)
   139  
   140      # TODO(krzyzacy): logging sink does not have timestamp
   141      #                 don't even bother listing it if not clear_all
   142      if resource.name == 'sinks' and not clear_all:
   143          return col
   144  
   145      cmd = base_command(resource)
   146      cmd.extend([
   147          'list',
   148          '--format=json(name,creationTimestamp.date(tz=UTC),zone,region,isManaged)',
   149          '--filter=%s' % filt,
   150          '--project=%s' % project])
   151      log('%r' % cmd)
   152  
   153      # TODO(krzyzacy): work around for alpha API list calls
   154      try:
   155          items = subprocess.check_output(cmd)
   156      except subprocess.CalledProcessError:
   157          if resource.tolerate:
   158              return col
   159          raise
   160  
   161      for item in json.loads(items):
   162          log('parsing item: %r' % item)
   163  
   164          if 'name' not in item:
   165              raise ValueError('missing key: name - %r' % item)
   166  
   167          if resource.condition and resource.condition in item:
   168              colname = item[resource.condition]
   169              log('looking for items in %s=%s' % (resource.condition, colname))
   170          else:
   171              colname = ''
   172  
   173          if validate_item(item, age, resource, clear_all):
   174              col[colname].append(item['name'])
   175      return col
   176  
   177  
   178  def clear_resources(project, cols, resource, rate_limit):
   179      """Clear a collection of resource, from collect func above.
   180  
   181      Args:
   182          project: The name of a gcp project.
   183          cols: A dict of collection of resource.
   184          resource: Definition of a type of gcloud resource.
   185          rate_limit: how many resources to delete per gcloud delete call
   186      Returns:
   187          0 if no error
   188          1 if deletion command fails
   189      """
   190      err = 0
   191  
   192      # delete one resource at a time, if there's no api support
   193      # aka, logging sinks for example
   194      if not resource.bulk_delete:
   195          rate_limit = 1
   196  
   197      for col, items in cols.items():
   198          if ARGS.dryrun:
   199              log('Resource type %r(%r) to be deleted: %r' %
   200                  (resource.name, resource.subgroup, list(items)))
   201              continue
   202  
   203          manage_key = {'Yes':'managed', 'No':'unmanaged'}
   204  
   205          # construct the customized gcloud command
   206          base = base_command(resource)
   207          if resource.managed:
   208              base.append(manage_key[resource.managed])
   209          base.append('delete')
   210          base.append('--project=%s' % project)
   211  
   212          condition = None
   213          if resource.condition:
   214              if col:
   215                  condition = '--%s=%s' % (resource.condition, col)
   216              else:
   217                  condition = '--global'
   218  
   219          # hard code asia-southeast1-a for NEG
   220          # TODO(freehan): remove this once limitation is dropped
   221          if resource.name == 'network-endpoint-groups':
   222              condition = '--zone=asia-southeast1-a'
   223  
   224          log('going to delete %d %s' % (len(items), resource.name))
   225          # try to delete at most $rate_limit items at a time
   226          for idx in xrange(0, len(items), rate_limit):
   227              clean = items[idx:idx+rate_limit]
   228              cmd = base + list(clean)
   229              if condition:
   230                  cmd.append(condition)
   231              log('Call %r' % cmd)
   232              try:
   233                  subprocess.check_call(cmd)
   234              except subprocess.CalledProcessError as exc:
   235                  if not resource.tolerate:
   236                      err = 1
   237                  print >>sys.stderr, 'Error try to delete resources: %r' % exc
   238      return err
   239  
   240  
   241  def clean_gke_cluster(project, age, filt):
   242      """Clean up potential leaking gke cluster"""
   243  
   244      # a cluster can be created in one of those three endpoints
   245      endpoints = [
   246          'https://test-container.sandbox.googleapis.com/', # test
   247          'https://staging-container.sandbox.googleapis.com/', # staging
   248          'https://container.googleapis.com/', # prod
   249      ]
   250  
   251      err = 0
   252      for endpoint in endpoints:
   253          os.environ['CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER'] = endpoint
   254          log("checking endpoint %s" % endpoint)
   255          cmd = [
   256              'gcloud', 'container', '-q', 'clusters', 'list',
   257              '--project=%s' % project,
   258              '--filter=%s' % filt,
   259              '--format=json(name,createTime,zone)'
   260              ]
   261          log('running %s' % cmd)
   262  
   263          output = ''
   264          try:
   265              output = subprocess.check_output(cmd)
   266          except subprocess.CalledProcessError as exc:
   267              # expected error
   268              log('Cannot reach endpoint %s with %r, continue' % (endpoint, exc))
   269              continue
   270  
   271          for item in json.loads(output):
   272              log('cluster info: %r' % item)
   273              if 'name' not in item or 'createTime' not in item or 'zone' not in item:
   274                  raise ValueError('name, createTime and zone must present: %r' % item)
   275  
   276              # The raw createTime string looks like 2017-08-30T18:33:14+00:00
   277              # Which python 2.7 does not support timezones.
   278              # Since age is already in UTC time we'll just strip the timezone part
   279              item['createTime'] = item['createTime'].split('+')[0]
   280              created = datetime.datetime.strptime(
   281                  item['createTime'], '%Y-%m-%dT%H:%M:%S')
   282  
   283              if created < age:
   284                  log('Found stale gke cluster %r in %r, created time = %r' %
   285                      (item['name'], endpoint, item['createTime']))
   286                  delete = [
   287                      'gcloud', 'container', '-q', 'clusters', 'delete',
   288                      item['name'],
   289                      '--project=%s' % project,
   290                      '--zone=%s' % item['zone'],
   291                  ]
   292                  try:
   293                      log('running %s' % delete)
   294                      subprocess.check_call(delete)
   295                  except subprocess.CalledProcessError as exc:
   296                      err = 1
   297                      print >>sys.stderr, 'Error try to delete cluster %s: %r' % (item['name'], exc)
   298  
   299      return err
   300  
   301  def main(project, days, hours, filt, rate_limit):
   302      """ Clean up resources from a gcp project based on it's creation time
   303  
   304      Args:
   305          project: The name of a gcp project.
   306          days/hours: days/hours of maximum lifetime of a gcp resource.
   307          filt: Resource instance filters when query.
   308      Returns:
   309          0 if no error
   310          1 if list or delete command fails
   311      """
   312  
   313      print '[=== Start Janitor on project %r ===]' % project
   314      err = 0
   315      age = datetime.datetime.utcnow() - datetime.timedelta(days=days, hours=hours)
   316      clear_all = (days is 0 and hours is 0)
   317      for res in DEMOLISH_ORDER:
   318          log('Try to search for %r with condition %r' % (res.name, res.condition))
   319          try:
   320              col = collect(project, age, res, filt, clear_all)
   321              if col:
   322                  err |= clear_resources(project, col, res, rate_limit)
   323          except (subprocess.CalledProcessError, ValueError):
   324              err |= 1 # keep clean the other resource
   325              print >>sys.stderr, 'Fail to list resource %r from project %r' % (res.name, project)
   326  
   327      # try to clean leaking gke cluster
   328      try:
   329          err |= clean_gke_cluster(project, age, filt)
   330      except ValueError:
   331          err |= 1 # keep clean the other resource
   332          print >>sys.stderr, 'Fail to clean up cluster from project %r' % project
   333  
   334      print '[=== Finish Janitor on project %r with status %r ===]' % (project, err)
   335      sys.exit(err)
   336  
   337  
   338  if __name__ == '__main__':
   339      PARSER = argparse.ArgumentParser(
   340          description='Clean up resources from an expired project')
   341      PARSER.add_argument('--project', help='Project to clean', required=True)
   342      PARSER.add_argument(
   343          '--days', type=int,
   344          help='Clean items more than --days old (added to --hours)')
   345      PARSER.add_argument(
   346          '--hours', type=float,
   347          help='Clean items more than --hours old (added to --days)')
   348      PARSER.add_argument(
   349          '--filter',
   350          default='name !~ ^default',
   351          help='Filter down to these instances')
   352      PARSER.add_argument(
   353          '--dryrun',
   354          default=False,
   355          action='store_true',
   356          help='List but not delete resources')
   357      PARSER.add_argument(
   358          '--ratelimit', type=int, default=50,
   359          help='Max number of resources to bulk clear in one gcloud delete call')
   360      PARSER.add_argument(
   361          '--verbose', action='store_true',
   362          help='Get full janitor output log')
   363      ARGS = PARSER.parse_args()
   364  
   365      # We want to allow --days=0 and --hours=0, so check against None instead.
   366      if ARGS.days is None and ARGS.hours is None:
   367          print >>sys.stderr, 'must specify --days and/or --hours'
   368          sys.exit(1)
   369  
   370      main(ARGS.project, ARGS.days or 0, ARGS.hours or 0, ARGS.filter, ARGS.ratelimit)