#!/usr/bin/env python

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Clean up resources from gcp projects. """

import argparse
import collections
import datetime
import json
import os
import subprocess
import sys


# A type of gcloud resource that needs to be cleared.
#
# Fields:
#   api_version: token inserted right after `gcloud` (e.g. 'beta'); '' for none.
#   group:       gcloud command group (e.g. 'compute', 'logging').
#   name:        resource collection name (e.g. 'instances').
#   subgroup:    optional extra token appended after `name` (e.g. 'subnets').
#   condition:   item key ('zone' / 'region') used to bucket listed items and
#                to build the `--<condition>=<value>` delete flag; when falsy
#                no location flag is added.
#   managed:     'Yes' / 'No' to only match items whose `isManaged` equals it
#                (and to add the `managed` / `unmanaged` token on delete);
#                None to skip the check.
#   tolerate:    when True, list and delete failures are not counted as errors.
#   bulk_delete: when False, items are deleted one per gcloud call.
Resource = collections.namedtuple(
    'Resource', 'api_version group name subgroup condition managed tolerate bulk_delete')
DEMOLISH_ORDER = [
    # [WARNING FROM KRZYZACY] : TOUCH THIS WITH CARE!
    # ORDER REALLY MATTERS HERE!

    # compute resources
    Resource('', 'compute', 'instances', None, 'zone', None, False, True),
    Resource('', 'compute', 'addresses', None, 'region', None, False, True),
    Resource('', 'compute', 'disks', None, 'zone', None, False, True),
    Resource('', 'compute', 'firewall-rules', None, None, None, False, True),
    Resource('', 'compute', 'routes', None, None, None, False, True),
    Resource('', 'compute', 'forwarding-rules', None, 'region', None, False, True),
    Resource('', 'compute', 'target-http-proxies', None, None, None, False, True),
    Resource('', 'compute', 'target-https-proxies', None, None, None, False, True),
    Resource('', 'compute', 'url-maps', None, None, None, False, True),
    Resource('', 'compute', 'backend-services', None, 'region', None, False, True),
    Resource('', 'compute', 'target-pools', None, 'region', None, False, True),
    Resource('', 'compute', 'health-checks', None, None, None, False, True),
    Resource('', 'compute', 'http-health-checks', None, None, None, False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'Yes', False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'No', False, True),
    Resource('', 'compute', 'instance-templates', None, None, None, False, True),
    Resource('beta', 'compute', 'network-endpoint-groups', None, None, None, True, False),
    Resource('', 'compute', 'networks', 'subnets', 'region', None, True, True),
    Resource('', 'compute', 'networks', None, '', None, False, True),
    Resource('', 'compute', 'routes', None, None, None, False, True),

    # logging resources
    Resource('', 'logging', 'sinks', None, None, None, False, False),
]


def log(message):
    """ Print a message if --verbose is set.

    Reads the module-level ARGS, which is only assigned when this file is run
    as a script.
    """
    if ARGS.verbose:
        print(message)


def base_command(resource):
    """ Return the base gcloud command with api_version, group and subgroup.

    Args:
        resource: Definition of a type of gcloud resource.
    Returns:
        list of base commands of gcloud .
    """

    base = ['gcloud']
    if resource.api_version:
        base += [resource.api_version]
    base += [resource.group, '-q', resource.name]
    if resource.subgroup:
        base.append(resource.subgroup)
    return base


def validate_item(item, age, resource, clear_all):
    """ Validate if an item needs to be cleaned.

    Args:
        item: a gcloud resource item from json format.
        age: Time cutoff from the creation of a resource.
        resource: Definition of a type of gcloud resource.
        clear_all: If need to clean regardless of timestamp.
    Returns:
        True if object needs to be cleaned, False otherwise.
    Raises:
        ValueError if json result from gcloud is invalid.
    """

    if resource.managed:
        if 'isManaged' not in item:
            raise ValueError(resource.name, resource.managed)
        # The managed filter applies even when clear_all is set.
        if resource.managed != item['isManaged']:
            return False

    # clears everything without checking creationTimestamp
    if clear_all:
        return True

    if 'creationTimestamp' not in item:
        raise ValueError('missing key: creationTimestamp - %r' % item)

    # Unify datetime to use utc timezone.
    created = datetime.datetime.strptime(item['creationTimestamp'], '%Y-%m-%dT%H:%M:%S')
    log('Found %r(%r), %r, created time = %r' %
        (resource.name, resource.subgroup, item['name'], item['creationTimestamp']))
    if created < age:
        log('Added to janitor list: %r(%r), %r' %
            (resource.name, resource.subgroup, item['name']))
        return True
    return False


def collect(project, age, resource, filt, clear_all):
    """ Collect a list of resources for each condition (zone or region).

    Args:
        project: The name of a gcp project.
        age: Time cutoff from the creation of a resource.
        resource: Definition of a type of gcloud resource.
        filt: Filter clause for gcloud list command.
        clear_all: If need to clean regardless of timestamp.
    Returns:
        A dict of condition : list of gcloud resource object.
    Raises:
        ValueError if json result from gcloud is invalid.
        subprocess.CalledProcessError if cannot list the gcloud resource
    """

    col = collections.defaultdict(list)

    # TODO(krzyzacy): logging sink does not have timestamp
    #                 don't even bother listing it if not clear_all
    if resource.name == 'sinks' and not clear_all:
        return col

    cmd = base_command(resource)
    cmd.extend([
        'list',
        '--format=json(name,creationTimestamp.date(tz=UTC),zone,region,isManaged)',
        '--filter=%s' % filt,
        '--project=%s' % project])
    log('%r' % cmd)

    # TODO(krzyzacy): work around for alpha API list calls
    try:
        items = subprocess.check_output(cmd)
    except subprocess.CalledProcessError:
        if resource.tolerate:
            return col
        raise

    for item in json.loads(items):
        log('parsing item: %r' % item)

        if 'name' not in item:
            raise ValueError('missing key: name - %r' % item)

        # Bucket by the item's zone/region value; items without one (or
        # resources without a condition) go into the '' bucket.
        if resource.condition and resource.condition in item:
            colname = item[resource.condition]
            log('looking for items in %s=%s' % (resource.condition, colname))
        else:
            colname = ''

        if validate_item(item, age, resource, clear_all):
            col[colname].append(item['name'])
    return col


def clear_resources(project, cols, resource, rate_limit):
    """Clear a collection of resource, from collect func above.

    Args:
        project: The name of a gcp project.
        cols: A dict of collection of resource.
        resource: Definition of a type of gcloud resource.
        rate_limit: how many resources to delete per gcloud delete call
    Returns:
        0 if no error
        1 if deletion command fails
    """
    err = 0

    # delete one resource at a time, if there's no api support
    # aka, logging sinks for example
    # A non-positive rate limit is also clamped to 1: a zero step would make
    # range() raise ValueError and a negative one would silently delete nothing.
    if not resource.bulk_delete or rate_limit < 1:
        rate_limit = 1

    manage_key = {'Yes': 'managed', 'No': 'unmanaged'}

    for col, items in cols.items():
        if ARGS.dryrun:
            log('Resource type %r(%r) to be deleted: %r' %
                (resource.name, resource.subgroup, list(items)))
            continue

        # construct the customized gcloud command
        base = base_command(resource)
        if resource.managed:
            base.append(manage_key[resource.managed])
        base.append('delete')
        base.append('--project=%s' % project)

        condition = None
        if resource.condition:
            if col:
                condition = '--%s=%s' % (resource.condition, col)
            else:
                condition = '--global'

        # hard code asia-southeast1-a for NEG
        # TODO(freehan): remove this once limitation is dropped
        if resource.name == 'network-endpoint-groups':
            condition = '--zone=asia-southeast1-a'

        log('going to delete %d %s' % (len(items), resource.name))
        # try to delete at most $rate_limit items at a time
        for idx in range(0, len(items), rate_limit):
            clean = items[idx:idx + rate_limit]
            cmd = base + list(clean)
            if condition:
                cmd.append(condition)
            log('Call %r' % cmd)
            try:
                subprocess.check_call(cmd)
            except subprocess.CalledProcessError as exc:
                if not resource.tolerate:
                    err = 1
                sys.stderr.write('Error try to delete resources: %r\n' % exc)
    return err


def clean_gke_cluster(project, age, filt):
    """Clean up potential leaking gke cluster.

    Lists clusters on each known container API endpoint and deletes the ones
    created before `age`. With --dryrun set, stale clusters are only logged.

    Args:
        project: The name of a gcp project.
        age: Time cutoff from the creation of a cluster.
        filt: Filter clause for gcloud list command.
    Returns:
        0 if no error
        1 if a delete command fails
    Raises:
        ValueError if json result from gcloud is invalid.
    """

    # a cluster can be created in one of those three endpoints
    endpoints = [
        'https://test-container.sandbox.googleapis.com/',  # test
        'https://staging-container.sandbox.googleapis.com/',  # staging
        'https://container.googleapis.com/',  # prod
    ]

    err = 0
    for endpoint in endpoints:
        # NOTE: this mutates the process environment and is not restored, so
        # the last endpoint stays set after the function returns.
        os.environ['CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER'] = endpoint
        log("checking endpoint %s" % endpoint)
        cmd = [
            'gcloud', 'container', '-q', 'clusters', 'list',
            '--project=%s' % project,
            '--filter=%s' % filt,
            '--format=json(name,createTime,zone)'
        ]
        log('running %s' % cmd)

        output = ''
        try:
            output = subprocess.check_output(cmd)
        except subprocess.CalledProcessError as exc:
            # expected error
            log('Cannot reach endpoint %s with %r, continue' % (endpoint, exc))
            continue

        for item in json.loads(output):
            log('cluster info: %r' % item)
            if 'name' not in item or 'createTime' not in item or 'zone' not in item:
                raise ValueError('name, createTime and zone must present: %r' % item)

            # The raw createTime string looks like 2017-08-30T18:33:14+00:00
            # Which python 2.7 does not support timezones.
            # Since age is already in UTC time we'll just strip the timezone part
            item['createTime'] = item['createTime'].split('+')[0]
            created = datetime.datetime.strptime(
                item['createTime'], '%Y-%m-%dT%H:%M:%S')

            if created < age:
                log('Found stale gke cluster %r in %r, created time = %r' %
                    (item['name'], endpoint, item['createTime']))
                # Honor --dryrun: report the stale cluster but never delete it.
                if ARGS.dryrun:
                    log('Dry run: skip deleting gke cluster %r' % item['name'])
                    continue
                delete = [
                    'gcloud', 'container', '-q', 'clusters', 'delete',
                    item['name'],
                    '--project=%s' % project,
                    '--zone=%s' % item['zone'],
                ]
                try:
                    log('running %s' % delete)
                    subprocess.check_call(delete)
                except subprocess.CalledProcessError as exc:
                    err = 1
                    sys.stderr.write(
                        'Error try to delete cluster %s: %r\n' % (item['name'], exc))

    return err


def main(project, days, hours, filt, rate_limit):
    """ Clean up resources from a gcp project based on its creation time

    Does not return: the process exits via sys.exit() with status
        0 if no error
        1 if list or delete command fails

    Args:
        project: The name of a gcp project.
        days/hours: days/hours of maximum lifetime of a gcp resource.
        filt: Resource instance filters when query.
        rate_limit: how many resources to delete per gcloud delete call
    """

    print('[=== Start Janitor on project %r ===]' % project)
    err = 0
    age = datetime.datetime.utcnow() - datetime.timedelta(days=days, hours=hours)
    # Compare by value: `is 0` only worked through CPython's small-int cache
    # and was False for hours=0.0.
    clear_all = (days == 0 and hours == 0)
    for res in DEMOLISH_ORDER:
        log('Try to search for %r with condition %r' % (res.name, res.condition))
        try:
            col = collect(project, age, res, filt, clear_all)
            if col:
                err |= clear_resources(project, col, res, rate_limit)
        except (subprocess.CalledProcessError, ValueError):
            err |= 1  # keep clean the other resource
            sys.stderr.write(
                'Fail to list resource %r from project %r\n' % (res.name, project))

    # try to clean leaking gke cluster
    try:
        err |= clean_gke_cluster(project, age, filt)
    except ValueError:
        err |= 1  # keep clean the other resource
        sys.stderr.write('Fail to clean up cluster from project %r\n' % project)

    print('[=== Finish Janitor on project %r with status %r ===]' % (project, err))
    sys.exit(err)


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Clean up resources from an expired project')
    PARSER.add_argument('--project', help='Project to clean', required=True)
    PARSER.add_argument(
        '--days', type=int,
        help='Clean items more than --days old (added to --hours)')
    PARSER.add_argument(
        '--hours', type=float,
        help='Clean items more than --hours old (added to --days)')
    PARSER.add_argument(
        '--filter',
        default='name !~ ^default',
        help='Filter down to these instances')
    PARSER.add_argument(
        '--dryrun',
        default=False,
        action='store_true',
        help='List but not delete resources')
    PARSER.add_argument(
        '--ratelimit', type=int, default=50,
        help='Max number of resources to bulk clear in one gcloud delete call')
    PARSER.add_argument(
        '--verbose', action='store_true',
        help='Get full janitor output log')
    ARGS = PARSER.parse_args()

    # We want to allow --days=0 and --hours=0, so check against None instead.
    if ARGS.days is None and ARGS.hours is None:
        sys.stderr.write('must specify --days and/or --hours\n')
        sys.exit(1)

    main(ARGS.project, ARGS.days or 0, ARGS.hours or 0, ARGS.filter, ARGS.ratelimit)