#!/usr/bin/env python

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Hourly maintenance script for Jenkins agents.

Every step returns 0 on success and a non-zero value on failure; main() ORs
the results together and uses the combination as the process exit status.
The code is written to be valid on both Python 2.7 and Python 3.
"""

import argparse
import collections
import datetime
import glob
import os
import re
import subprocess
import sys
import traceback

import requests

# Only the leading 'YYYY-mm-dd HH:MM' part of docker's CreatedAt value is
# parsed; seconds and the timezone suffix are ignored.
_CREATED_PREFIX_LEN = len('YYYY-mm-dd HH:MM')
_CREATED_FORMAT = '%Y-%m-%d %H:%M'


def _to_text(data):
    """Return data as a native str (decodes bytes on Python 3)."""
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _check_output(cmd):
    """Run cmd and return its stdout as text; raises CalledProcessError on failure."""
    return _to_text(subprocess.check_output(cmd))


def _read_text(path):
    """Return the whole content of path as text."""
    with open(path, 'rb') as handle:
        return _to_text(handle.read())


def _warn(message):
    """Write one line to stderr."""
    sys.stderr.write(message + '\n')


def container_images():
    """Yield the image of every container on this host, running or not."""
    listing = _check_output(['docker', 'ps', '-a', '--format={{.Image}}'])
    for line in listing.split('\n'):
        if line:
            yield line


def _old_containers(statuses, max_age):
    """Return IDs of containers in any of statuses created more than max_age ago."""
    cmd = ['docker', 'ps', '-a']
    for status in statuses:
        cmd += ['-f', 'status=%s' % status]
    cmd.append('--format={{.CreatedAt}}\t{{.ID}}')

    now = datetime.datetime.now()
    old = []
    for line in _check_output(cmd).split('\n'):
        if not line:
            continue
        created, container_id = line.split('\t')
        created_at = datetime.datetime.strptime(
            created[:_CREATED_PREFIX_LEN], _CREATED_FORMAT)
        if now - created_at > max_age:
            old.append(container_id)
    return old


def kill_containers():
    """Kill containers that have been running for more than a day.

    Returns 0 when there is nothing to kill, else the `docker kill` exit code.
    """
    old = _old_containers(['running'], datetime.timedelta(days=1))
    if not old:
        return 0

    print('Old running containers to kill: %s' % (old,))
    err = subprocess.call(['docker', 'kill'] + old)
    if err:
        _warn('kill_containers failed')
    return err


def remove_containers():
    """Remove non-running containers that were created over two hours ago.

    Returns 0 when there is nothing to remove, else the `docker rm` exit code.
    """
    statuses = [
        'created',  # Never started due to timeout
        'exited',  # Container exited
        'dead',  # Zombie container
    ]
    old = _old_containers(statuses, datetime.timedelta(hours=2))
    if not old:
        return 0

    print('Old non-running containers to remove: %s' % (old,))
    err = subprocess.call(['docker', 'rm', '-v'] + old)
    if err:
        _warn('remove_containers failed')
    return err


def remove_images(skip, ancient):
    """Remove all tagged images except the most recently listed one per name.

    Args:
        skip: collection of 'name:tag' strings that must never be removed
            (main() passes the images of existing containers).
        ancient: when true, images whose age is reported in weeks or months
            are removed even if they are the only tag for their name.

    Images created within roughly the last hour are left alone. Dangling
    images are always removed. Returns 0 on success, non-zero otherwise.
    """
    tags = collections.defaultdict(list)

    for line in _check_output(['docker', 'images']).split('\n'):
        fields = re.split(r'\s+', line.strip())
        if len(fields) < 4:
            continue  # blank or malformed line
        name, tag = fields[:2]
        # Columns are REPOSITORY, TAG, IMAGE ID, CREATED, SIZE and CREATED is
        # several words ("2 weeks ago", "About an hour ago"), so the age is
        # everything after the image ID rather than just the fourth word.
        # The header row yields a single 'REPOSITORY' entry, which the
        # "fewer than two versions" check below discards.
        age = ' '.join(fields[3:])
        if 'second' in age or 'minute' in age:
            continue
        if 'hour' in age and 'hours' not in age:
            continue
        if '%s:%s' % (name, tag) in skip:
            continue
        tags[name].append(tag)
        if ancient and ('weeks' in age or 'months' in age):
            tags[name].append(tag)  # Always delete ancient images

    err = 0
    for name, versions in tags.items():
        if name == '<none>':
            continue
        if len(versions) < 2:
            continue
        # docker lists newest first, so versions[0] is the one to keep.
        untag = ['%s:%s' % (name, v) for v in set(versions[1:])]
        print('Remove %d %s images: %s' % (len(untag), name, untag))
        err |= subprocess.call(['docker', 'rmi'] + untag)

    dangling = _check_output([
        'docker', 'images', '-q', '-f', 'dangling=true'])
    if dangling:
        err |= subprocess.call(['docker', 'rmi', '-f'] + dangling.split())

    if err:
        _warn('remove_images failed')
    return err


def remove_volumes():
    """Run the docker-cleanup-volumes image; returns its exit code."""
    err = subprocess.call([
        'docker', 'run',
        '-v', '/var/run/docker.sock:/var/run/docker.sock',
        '-v', '/var/lib/docker:/var/lib/docker',
        '--rm', 'martin/docker-cleanup-volumes'])
    if err:
        _warn('remove_volumes failed')
    return err


def kill_looping_bash():
    """Kill bash processes that have used more than 30 minutes of user CPU time.

    Prints the command line and environment of each victim before killing it.
    Returns 0 on success, non-zero if any `kill` failed.
    """
    err = 0
    try:
        bash_procs = _check_output(['pgrep', '-f', '^(/bin/)?bash']).split()
    except subprocess.CalledProcessError as exc:
        if exc.returncode != 1:
            raise
        bash_procs = []  # pgrep exits 1 when no process matches

    clock_hz = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
    for pid in bash_procs:
        # man 5 proc
        try:
            stat = _read_text('/proc/%s/stat' % pid)
        except (IOError, OSError):
            continue  # process exited since pgrep listed it
        # The comm field (2nd) is parenthesised and may contain spaces, so
        # count fields from after its closing paren: utime is the 14th field
        # overall, i.e. index 11 once pid and comm are dropped.
        after_comm = stat.rpartition(')')[2].split()
        utime_minutes = int(after_comm[11]) // clock_hz // 60
        if utime_minutes <= 30:
            continue
        try:
            cmdline = _read_text('/proc/%s/cmdline' % pid)
        except (IOError, OSError):
            continue  # process exited since pgrep listed it
        cmdline = cmdline.replace('\x00', ' ').strip()
        print("killing bash pid %s (%r) with %d minutes of CPU time" % (
            pid, cmdline, utime_minutes))
        print('Environment variables:')
        try:
            environ = _check_output(['sudo', 'cat', '/proc/%s/environ' % pid])
            print('\n'.join(sorted(environ.split('\x00'))))
        except subprocess.CalledProcessError:
            # The dump is diagnostic only; still attempt the kill.
            _warn('unable to read environment of pid %s' % pid)
        err |= subprocess.call(['sudo', 'kill', '-9', pid])
    return err


def delete_corrupt_git_repos():
    """
    Find and delete corrupt .git directories. This can occur when the agent
    reboots in the middle of a git operation. This is *still* less flaky than doing
    full clones every time and occasionally timing out because GitHub is throttling us :(

    Git complains with things like this:

    error: object file ws/.git/objects/01/e6eeca... is empty
    fatal: loose object 01e6eeca211171e9ae5117bbeed738218d2cdb09
    (stored in ws/.git/objects/01/e6eeca..) is corrupt

    Returns non-zero if any repository was corrupt or could not be deleted.
    """
    # TODO(rmmh): find a way to run this on boot for each jenkins agent, to
    # clean up corrupted git directories before a job can trip over them.
    err = 0
    for git_dir in glob.glob('/var/lib/jenkins/workspace/*/.git'):
        if not _check_output(['find', git_dir, '-size', '0']):
            # git fsck is kind of slow (~30s each), this fast heuristic speeds things up.
            continue
        print('validating git dir: %s' % git_dir)
        corrupt = subprocess.call(['git', '--git-dir', git_dir, 'fsck'])
        err |= corrupt  # flag
        # Decide on this repository's own fsck result, not the accumulated
        # flag, so one corrupt repo does not cause healthy ones to be deleted.
        if corrupt:
            print('deleting corrupt git dir')
            err |= subprocess.call(['rm', '-rf', git_dir])
    return err


def delete_old_workspaces():
    """Delete workspace dirs whose job no longer exists on any Jenkins master.

    Returns 1 without deleting anything when no master could be queried,
    otherwise the OR of the `rm` exit codes.
    """
    err = 0
    live_jobs = None
    for host in ('jenkins-master', 'pull-jenkins-master'):
        try:
            resp = requests.get("http://%s:8080/api/json?pretty=true&tree=jobs[name]" % host)
            resp.raise_for_status()
            names = {job['name'] for job in resp.json()['jobs']}
        except requests.exceptions.ConnectionError:
            continue
        except requests.exceptions.RequestException:
            traceback.print_exc()
            continue
        # Union the answers so that a job live on either master keeps its
        # workspace when both masters respond.
        live_jobs = names if live_jobs is None else live_jobs | names
    if live_jobs is None:
        print('unable to determine live jenkins jobs, not deleting any workspaces')
        return 1
    for dirname in sorted(glob.glob('/var/lib/jenkins/workspace/*')):
        key = os.path.basename(dirname).replace('@tmp', '')
        if key not in live_jobs:
            print('deleting old job workspace %s' % dirname)
            err |= subprocess.call(['sudo', 'rm', '-rf', dirname])
    return err


def main(ancient):
    """Run every maintenance step and exit with the OR of their results."""
    # Copied from http://blog.yohanliyanage.com/2015/05/docker-clean-up-after-yourself/
    err = 0
    err |= kill_containers()
    err |= remove_containers()
    err |= remove_images(set(container_images()), ancient)
    err |= remove_volumes()
    err |= kill_looping_bash()
    err |= delete_corrupt_git_repos()
    err |= delete_old_workspaces()
    sys.exit(err)


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Run hourly maintenance on jenkins agents')
    PARSER.add_argument('--ancient', action='store_true', help='Delete all old images')
    ARGS = PARSER.parse_args()
    main(ARGS.ancient)