#!/usr/bin/env python3

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Clean up resources from gcp projects. """

import argparse
import collections
import datetime
import json
import os
import subprocess
import sys
import threading

# A resource that need to be cleared.
Resource = collections.namedtuple(
    'Resource', 'api_version group name subgroup condition managed tolerate bulk_delete')
DEMOLISH_ORDER = [
    # [WARNING FROM KRZYZACY] : TOUCH THIS WITH CARE!
    # ORDER REALLY MATTERS HERE!

    # compute resources
    Resource('', 'compute', 'instances', None, 'zone', None, False, True),
    Resource('', 'compute', 'addresses', None, 'global', None, False, True),
    Resource('', 'compute', 'addresses', None, 'region', None, False, True),
    Resource('', 'compute', 'disks', None, 'zone', None, False, True),
    Resource('', 'compute', 'disks', None, 'region', None, False, True),
    Resource('', 'compute', 'firewall-rules', None, None, None, False, True),
    Resource('', 'compute', 'forwarding-rules', None, 'global', None, False, True),
    Resource('', 'compute', 'forwarding-rules', None, 'region', None, False, True),
    Resource('', 'compute', 'target-http-proxies', None, 'global', None, False, True),
    Resource('', 'compute', 'target-http-proxies', None, 'region', None, False, True),
    Resource('', 'compute', 'target-https-proxies', None, 'global', None, False, True),
    Resource('', 'compute', 'target-https-proxies', None, 'region', None, False, True),
    Resource('', 'compute', 'target-tcp-proxies', None, None, None, False, True),
    Resource('', 'compute', 'ssl-certificates', None, 'global', None, False, True),
    Resource('', 'compute', 'ssl-certificates', None, 'region', None, False, True),
    Resource('', 'compute', 'url-maps', None, 'global', None, False, True),
    Resource('', 'compute', 'url-maps', None, 'region', None, False, True),
    Resource('', 'compute', 'backend-services', None, 'global', None, False, True),
    Resource('', 'compute', 'backend-services', None, 'region', None, False, True),
    Resource('', 'compute', 'target-pools', None, 'region', None, False, True),
    Resource('', 'compute', 'health-checks', None, 'global', None, False, True),
    Resource('', 'compute', 'health-checks', None, 'region', None, False, True),
    Resource('', 'compute', 'http-health-checks', None, None, None, False, True),
    Resource('', 'compute', 'instance-groups', None, 'region', 'Yes', False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'Yes', False, True),
    Resource('', 'compute', 'instance-groups', None, 'zone', 'No', False, True),
    Resource('', 'compute', 'instance-templates', None, None, None, False, True),
    Resource('', 'compute', 'sole-tenancy', 'node-groups', 'zone', None, False, True),
    Resource('', 'compute', 'sole-tenancy', 'node-templates', 'region', None, False, True),
    Resource('', 'compute', 'network-endpoint-groups', None, 'zone', None, False, False),
    Resource('', 'compute', 'routes', None, None, None, False, True),
    Resource('', 'compute', 'routers', None, 'region', None, False, True),
    Resource('', 'compute', 'networks', 'subnets', 'region', None, True, True),
    Resource('', 'compute', 'networks', None, None, None, False, True),

    # logging resources
    Resource('', 'logging', 'sinks', None, None, None, False, False),
]

# Parsed command-line flags. Rebound by the `__main__` section at the bottom
# of this file; the default keeps log() and asyncCall() usable (quiet, not a
# dry run) when the file is imported as a module rather than run as a script.
ARGS = argparse.Namespace(verbose=False, dryrun=False)

# Explicit --zones flag passed to zonal `gcloud ... list` calls in collect().
_ZONES_FLAG = (
    '--zones=asia-east1-a,asia-east1-b,asia-east1-c,asia-east2-a,asia-east2-b,asia-east2-c,'
    'asia-northeast1-a,asia-northeast1-b,asia-northeast1-c,asia-northeast2-a,asia-northeast2-b,asia-northeast2-c,'
    'asia-northeast3-a,asia-northeast3-b,asia-northeast3-c,asia-south1-a,asia-south1-b,asia-south1-c,'
    'asia-southeast1-a,asia-southeast1-b,asia-southeast1-c,australia-southeast1-a,australia-southeast1-b,'
    'australia-southeast1-c,europe-north1-a,europe-north1-b,europe-north1-c,europe-west1-b,europe-west1-c,'
    'europe-west1-d,europe-west2-a,europe-west2-b,europe-west2-c,europe-west3-a,europe-west3-b,europe-west3-c,'
    'europe-west4-a,europe-west4-b,europe-west4-c,europe-west6-a,europe-west6-b,europe-west6-c,'
    'northamerica-northeast1-a,northamerica-northeast1-b,northamerica-northeast1-c,southamerica-east1-a,'
    'southamerica-east1-b,southamerica-east1-c,us-central1-a,us-central1-b,us-central1-c,us-central1-f,'
    'us-east1-b,us-east1-c,us-east1-d,us-east4-a,us-east4-b,us-east4-c,us-west1-a,us-west1-b,us-west1-c,'
    'us-west2-a,us-west2-b,us-west2-c,us-west3-a,us-west3-b,us-west3-c'
)


def log(message):
    """ print a message if --verbose is set. """
    if ARGS.verbose:
        tss = "[" + str(datetime.datetime.now()) + "] "
        print(tss + message + '\n')


def base_command(resource):
    """ Return the base gcloud command with api_version, group and subgroup.

    Args:
        resource: Definition of a type of gcloud resource.
    Returns:
        list of base commands of gcloud .
    """

    base = ['gcloud']
    if resource.api_version:
        base += [resource.api_version]
    base += [resource.group, '-q', resource.name]
    if resource.subgroup:
        base.append(resource.subgroup)
    return base


def validate_item(item, age, resource, clear_all):
    """ Validate if an item need to be cleaned.

    Args:
        item: a gcloud resource item from json format.
        age: Time cutoff from the creation of a resource.
        resource: Definition of a type of gcloud resource.
        clear_all: If need to clean regardless of timestamp.
    Returns:
        True if object need to be cleaned, False otherwise.
    Raises:
        ValueError if json result from gcloud is invalid.
    """

    # For resources split into managed/unmanaged flavours, only accept items
    # whose 'isManaged' value matches the flavour this Resource targets.
    if resource.managed:
        if 'isManaged' not in item:
            raise ValueError(resource.name, resource.managed)
        if resource.managed != item['isManaged']:
            return False

    # clears everything without checking creationTimestamp
    if clear_all:
        return True

    if 'creationTimestamp' not in item:
        raise ValueError('missing key: creationTimestamp - %r' % item)

    # Unify datetime to use utc timezone.
    created = datetime.datetime.strptime(item['creationTimestamp'], '%Y-%m-%dT%H:%M:%S')
    log('Found %r(%r), %r, created time = %r' %
        (resource.name, resource.subgroup, item['name'], item['creationTimestamp']))
    if created < age:
        log('Added to janitor list: %r(%r), %r' %
            (resource.name, resource.subgroup, item['name']))
        return True
    return False


def collect(project, age, resource, filt, clear_all):
    """ Collect a list of resources for each condition (zone or region).

    Args:
        project: The name of a gcp project.
        age: Time cutoff from the creation of a resource.
        resource: Definition of a type of gcloud resource.
        filt: Filter clause for gcloud list command.
        clear_all: If need to clean regardless of timestamp.
    Returns:
        A dict of condition : list of gcloud resource object.
    Raises:
        ValueError if json result from gcloud is invalid.
        subprocess.CalledProcessError if cannot list the gcloud resource
    """

    col = collections.defaultdict(list)

    # TODO(krzyzacy): logging sink does not have timestamp
    #                 don't even bother listing it if not clear_all
    if resource.name == 'sinks' and not clear_all:
        return col

    cmd = base_command(resource)
    cmd.extend([
        'list',
        '--format=json(name,creationTimestamp.date(tz=UTC),zone,region,isManaged)',
        '--filter=%s' % filt,
        '--project=%s' % project])
    # Zonal listings get an explicit --zones flag, except for the two
    # resource kinds excluded here.
    if (resource.condition == 'zone'
            and resource.name not in ('sole-tenancy', 'network-endpoint-groups')):
        cmd.append(_ZONES_FLAG)
    log('%r' % cmd)

    # TODO(krzyzacy): work around for alpha API list calls
    try:
        items = subprocess.check_output(cmd)
    except subprocess.CalledProcessError:
        if resource.tolerate:
            return col
        raise

    for item in json.loads(items):
        log('parsing item: %r' % item)

        if 'name' not in item:
            raise ValueError('missing key: name - %r' % item)

        colname = ''
        if resource.condition is not None:
            # This subcommand will want either a --global, --region, or --zone
            # flag, so segment items accordingly.
            if resource.condition == 'global':
                if 'zone' in item or 'region' in item:
                    # This item is zonal or regional, so don't include it in
                    # the global list.
                    continue
            elif resource.condition in item:
                # Looking for zonal or regional items, and this matches.
                # The zone or region is sometimes a full URL (why?), but
                # subcommands want just the name, not the full URL, so strip it.
                colname = item[resource.condition].rsplit('/', 1)[-1]
                log('looking for items in %s=%s' % (resource.condition, colname))
            else:
                # This item doesn't match the condition, so don't include it.
                continue

        if validate_item(item, age, resource, clear_all):
            col[colname].append(item['name'])
    return col


def asyncCall(cmd, tolerate, name, errs, lock, hide_output):
    """Run one gcloud command; intended as a threading.Thread target.

    Args:
        cmd: The command to run, as an argv list.
        tolerate: If True, a failing command is not recorded in errs.
        name: Resource name, used only in the error message.
        errs: Shared list that collects CalledProcessError instances.
        lock: Lock guarding appends to errs.
        hide_output: If True, the command's stdout is discarded.
    """
    log('%sCall %r' % ('[DRYRUN] ' if ARGS.dryrun else '', cmd))
    if ARGS.dryrun:
        return
    try:
        # subprocess.DEVNULL avoids opening (and leaking) a handle to
        # os.devnull on every call; stdout=None inherits the parent's stdout.
        subprocess.check_call(
            cmd, stdout=subprocess.DEVNULL if hide_output else None)
    except subprocess.CalledProcessError as exc:
        if not tolerate:
            with lock:
                errs.append(exc)
        print('Error try to delete resources %s: %r' % (name, exc), file=sys.stderr)


def clear_resources(project, cols, resource, rate_limit):
    """Clear a collection of resource, from collect func above.

    Args:
        project: The name of a gcp project.
        cols: A dict of collection of resource.
        resource: Definition of a type of gcloud resource.
        rate_limit: how many resources to delete per gcloud delete call
    Returns:
        0 if no error
        > 0 if deletion command fails
    """
    errs = []
    threads = []
    lock = threading.Lock()

    # delete one resource at a time, if there's no api support
    # aka, logging sinks for example
    if not resource.bulk_delete:
        rate_limit = 1
    # A zero step makes range() raise and a negative one silently skips every
    # item, so always delete at least one resource per call.
    rate_limit = max(1, rate_limit)

    # construct the customized gcloud command; it is the same for every
    # collection, only the trailing names and location flag differ.
    manage_key = {'Yes': 'managed', 'No': 'unmanaged'}
    base = base_command(resource)
    if resource.managed:
        base.append(manage_key[resource.managed])
    base.append('delete')
    base.append('--project=%s' % project)

    for col, items in cols.items():
        condition = None
        if resource.condition and col:
            condition = '--%s=%s' % (resource.condition, col)
        elif resource.condition == 'global':
            condition = '--global'

        log('going to delete %d %s' % (len(items), resource.name))
        # try to delete at most $rate_limit items at a time
        for idx in range(0, len(items), rate_limit):
            clean = items[idx:idx + rate_limit]
            cmd = base + list(clean)
            if condition:
                cmd.append(condition)
            thread = threading.Thread(
                target=asyncCall,
                args=(cmd, resource.tolerate, resource.name, errs, lock, False))
            threads.append(thread)
            log('start a new thread, total %d' % len(threads))
            thread.start()

    log('Waiting for all %d thread to finish' % len(threads))
    for thread in threads:
        thread.join()
    return len(errs)


def clean_gke_cluster(project, age, filt):
    """Clean up potential leaking gke cluster

    Args:
        project: The name of a gcp project.
        age: Time cutoff; clusters created before it are deleted.
        filt: Filter clause for gcloud list command.
    Returns:
        True if any cluster deletion failed, False otherwise.
    Raises:
        ValueError if json result from gcloud is invalid.
    """

    # a cluster can be created in one of those endpoints
    endpoints = [
        'https://test-container.sandbox.googleapis.com/',  # test
        'https://staging-container.sandbox.googleapis.com/',  # staging
        'https://staging2-container.sandbox.googleapis.com/',  # staging2
        'https://container.googleapis.com/',  # prod
    ]

    errs = []

    for endpoint in endpoints:
        threads = []
        lock = threading.Lock()

        # gcloud child processes inherit this override from our environment.
        os.environ['CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER'] = endpoint
        log("checking endpoint %s" % endpoint)
        cmd = [
            'gcloud', 'container', '-q', 'clusters', 'list',
            '--project=%s' % project,
            '--filter=%s' % filt,
            '--format=json(name,createTime,region,zone)'
        ]
        log('running %s' % cmd)

        try:
            output = subprocess.check_output(cmd)
        except subprocess.CalledProcessError as exc:
            # expected error
            log('Cannot reach endpoint %s with %r, continue' % (endpoint, exc))
            continue

        for item in json.loads(output):
            log('cluster info: %r' % item)
            if 'name' not in item or 'createTime' not in item:
                raise ValueError('name and createTime must be present: %r' % item)
            if not ('zone' in item or 'region' in item):
                raise ValueError('either zone or region must be present: %r' % item)

            # The raw createTime string looks like 2017-08-30T18:33:14+00:00
            # Since age is a naive UTC time we just strip the timezone part
            # and compare naive datetimes.
            item['createTime'] = item['createTime'].split('+')[0]
            created = datetime.datetime.strptime(
                item['createTime'], '%Y-%m-%dT%H:%M:%S')

            if created < age:
                log('Found stale gke cluster %r in %r, created time = %r' %
                    (item['name'], endpoint, item['createTime']))
                delete = [
                    'gcloud', 'container', '-q', 'clusters', 'delete',
                    item['name'],
                    '--project=%s' % project,
                ]
                if 'zone' in item:
                    delete.append('--zone=%s' % item['zone'])
                elif 'region' in item:
                    delete.append('--region=%s' % item['region'])
                thread = threading.Thread(
                    target=asyncCall,
                    args=(delete, False, item['name'], errs, lock, True))
                threads.append(thread)
                log('start a new thread, total %d' % len(threads))
                thread.start()

        # Finish this endpoint's deletions before the override is changed for
        # the next endpoint.
        log('Waiting for all %d thread to finish in %s' % (len(threads), endpoint))
        for thread in threads:
            thread.join()

    return len(errs) > 0


def activate_service_account(service_account):
    """Activate a service account for gcloud from a key file.

    Args:
        service_account: Path of the service account key file.
    Returns:
        0 if activation succeeded, 1 otherwise.
    """
    print('[=== Activating service_account %s ===]' % service_account)
    cmd = [
        'gcloud', 'auth', 'activate-service-account',
        '--key-file=%s' % service_account,
    ]
    log('running %s' % cmd)

    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        print('Error try to activate service_account: %s' % service_account, file=sys.stderr)
        return 1
    return 0


def main(project, days, hours, filt, rate_limit, service_account):
    """ Clean up resources from a gcp project based on it's creation time

    Exits the process via sys.exit() with the status below.

    Args:
        project: The name of a gcp project.
        days/hours: days/hours of maximum lifetime of a gcp resource.
        filt: Resource instance filters when query.
        rate_limit: Max number of resources per gcloud delete call.
        service_account: Key file of a service account to activate first,
            or a falsy value to skip activation.
    Returns:
        0 if no error
        1 if list or delete command fails
    """

    print('[=== Start Janitor on project %r ===]' % project)
    err = 0
    # Naive UTC "now", comparable with the naive timestamps parsed from gcloud.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    age = now - datetime.timedelta(days=days, hours=hours)
    clear_all = (days == 0 and hours == 0)

    if service_account:
        err |= activate_service_account(service_account)
        if err:
            print('Failed to activate service account %r' % (
                service_account), file=sys.stderr)
            sys.exit(err)

    # try to clean a leaked GKE cluster first, rather than attempting to delete
    # its associated resources individually.
    try:
        if clean_gke_cluster(project, age, filt):
            err |= 1
    except ValueError:
        err |= 1  # keep clean the other resource
        print('Fail to clean up cluster from project %r' % project, file=sys.stderr)

    for res in DEMOLISH_ORDER:
        log('Try to search for %r with condition %r, managed %r' % (
            res.name, res.condition, res.managed))
        try:
            col = collect(project, age, res, filt, clear_all)
            # clear_resources returns an error *count*. OR-ing the raw count
            # into the exit status can yield arbitrary values, and a multiple
            # of 256 is reported by the OS as success, so normalise to 0/1.
            if col and clear_resources(project, col, res, rate_limit):
                err |= 1
        except (subprocess.CalledProcessError, ValueError):
            err |= 1  # keep clean the other resource
            print('Fail to list resource %r from project %r' % (
                res.name, project), file=sys.stderr)

    print('[=== Finish Janitor on project %r with status %r ===]' % (project, err))
    sys.exit(err)


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Clean up resources from an expired project')
    PARSER.add_argument('--project', help='Project to clean', required=True)
    PARSER.add_argument(
        '--days', type=int,
        help='Clean items more than --days old (added to --hours)')
    PARSER.add_argument(
        '--hours', type=float,
        help='Clean items more than --hours old (added to --days)')
    PARSER.add_argument(
        '--filter',
        default='name !~ ^default',
        help='Filter down to these instances')
    PARSER.add_argument(
        '--dryrun',
        default=False,
        action='store_true',
        help='List but not delete resources')
    PARSER.add_argument(
        '--ratelimit', type=int, default=50,
        help='Max number of resources to bulk clear in one gcloud delete call')
    PARSER.add_argument(
        '--verbose', action='store_true',
        help='Get full janitor output log')
    PARSER.add_argument(
        '--service_account',
        help='GCP service account',
        default=os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", None))
    ARGS = PARSER.parse_args()

    # We want to allow --days=0 and --hours=0, so check against None instead.
    if ARGS.days is None and ARGS.hours is None:
        print('must specify --days and/or --hours', file=sys.stderr)
        sys.exit(1)

    main(ARGS.project, ARGS.days or 0, ARGS.hours or 0, ARGS.filter,
         ARGS.ratelimit, ARGS.service_account)