github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/jenkins/hourly_maintenance.py (about)

     1  #!/usr/bin/env python
     2  
     3  # Copyright 2016 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Hourly maintenance script for Jenkins agents."""
    18  
    19  import argparse
    20  import collections
    21  import datetime
    22  import glob
    23  import os
    24  import re
    25  import subprocess
    26  import sys
    27  import traceback
    28  
    29  import requests
    30  
    31  def container_images():
    32      """find all running images."""
    33      for line in subprocess.check_output([
    34              'docker',
    35              'ps', '-a',
    36              '--format={{.Image}}']).split('\n'):
    37          if not line:
    38              continue
    39          yield line
    40  
    41  
    42  def kill_containers():
    43      """Kill containers that have been running for a long time."""
    44      now = datetime.datetime.now()
    45      old = []
    46      for line in subprocess.check_output([
    47              'docker', 'ps', '-a',
    48              '-f', 'status=running',
    49              '--format={{.CreatedAt}}\t{{.ID}}',
    50      ]).split('\n'):
    51          if not line:
    52              continue
    53          created, name = line.split('\t')
    54          fmt = 'YYYY-MM-dd HH:MM'
    55          created = datetime.datetime.strptime(created[:len(fmt)], '%Y-%m-%d %H:%M')
    56          if now - created > datetime.timedelta(days=1):
    57              old.append(name)
    58  
    59      if not old:
    60          return 0
    61  
    62      print 'Old running containers to kill:', old
    63      err = subprocess.call(['docker', 'kill'] + old)
    64      if err:
    65          print >>sys.stderr, 'kill_containers failed'
    66      return err
    67  
    68  
    69  def remove_containers():
    70      """Remove non-running containers that we created a long time ago."""
    71      now = datetime.datetime.now()
    72      old = []
    73      for line in subprocess.check_output([
    74              'docker',
    75              'ps', '-a',
    76              '-f', 'status=created',  # Never started due to timeout
    77              '-f', 'status=exited',  # Container exited
    78              '-f', 'status=dead',  # Zombie container
    79              '--format={{.CreatedAt}}\t{{.ID}}\t{{.Image}}',
    80      ]).split('\n'):
    81          if not line:
    82              continue
    83          created, name, _image = line.split('\t')
    84          fmt = 'YYYY-mm-dd HH:MM'
    85          created = datetime.datetime.strptime(created[:len(fmt)], '%Y-%m-%d %H:%M')
    86          if now - created > datetime.timedelta(hours=2):
    87              old.append(name)
    88  
    89      if not old:
    90          return 0
    91  
    92      print 'Old non-running containers to remove:', old
    93      err = subprocess.call(['docker', 'rm', '-v'] + old)
    94      if err:
    95          print >>sys.stderr, 'remove_containers failed'
    96      return err
    97  
    98  
    99  def remove_images(skip, ancient):
   100      """Remove all tagged images except the most recently downloaded one."""
   101      tags = collections.defaultdict(list)
   102      images = subprocess.check_output(['docker', 'images'])
   103  
   104      for line in images.split('\n'):
   105          if not line:
   106              continue
   107          name, tag, _unused, age = re.split(r'\s+', line.strip())[:4]
   108          if 'minutes' in age:
   109              continue
   110          if 'hour' in age and 'hours' not in age:
   111              continue
   112          if '%s:%s' % (name, tag) in skip:
   113              continue
   114          tags[name].append(tag)
   115          if ancient and ('weeks' in age or 'months' in age):
   116              tags[name].append(tag)  # Always delete ancient images
   117  
   118      err = 0
   119      for name, versions in tags.items():
   120          if name == '<none>':
   121              continue
   122          if len(versions) < 2:
   123              continue
   124          untag = ['%s:%s' % (name, v) for v in set(versions[1:])]
   125          print 'Remove %d %s images: %s' % (len(untag), name, untag)
   126          err |= subprocess.call(['docker', 'rmi'] + untag)
   127  
   128      dangling = subprocess.check_output([
   129          'docker', 'images', '-q', '-f', 'dangling=true'])
   130      if dangling:
   131          err |= subprocess.call(['docker', 'rmi', '-f'] + dangling.split())
   132  
   133      if err:
   134          print >>sys.stderr, 'remove_images failed'
   135      return err
   136  
   137  
   138  def remove_volumes():
   139      """Run docker cleanup volumes."""
   140      err = subprocess.call([
   141          'docker', 'run',
   142          '-v', '/var/run/docker.sock:/var/run/docker.sock',
   143          '-v', '/var/lib/docker:/var/lib/docker',
   144          '--rm', 'martin/docker-cleanup-volumes'])
   145      if err:
   146          print >>sys.stderr, 'remove_volumes failed'
   147      return err
   148  
   149  
   150  def kill_looping_bash():
   151      """kill hanging scripts."""
   152      err = 0
   153      bash_procs = subprocess.check_output(['pgrep', '-f', '^(/bin/)?bash']).split()
   154  
   155      clock_hz = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
   156      for pid in bash_procs:
   157          # man 5 proc
   158          with open('/proc/%s/stat' % pid) as fp:
   159              stat = fp.read().split()
   160          utime = int(stat[13]) / clock_hz
   161          utime_minutes = utime / 60
   162          if utime_minutes > 30:
   163              with open('/proc/%s/cmdline' % pid) as fp:
   164                  cmdline = fp.read().replace('\x00', ' ').strip()
   165              print "killing bash pid %s (%r) with %d minutes of CPU time" % (
   166                  pid, cmdline, utime_minutes)
   167              print 'Environment variables:'
   168              environ = subprocess.check_output(['sudo', 'cat', '/proc/%s/environ' % pid])
   169              print '\n'.join(sorted(environ.split('\x00')))
   170              err |= subprocess.call(['sudo', 'kill', '-9', pid])
   171      return err
   172  
   173  
   174  def delete_corrupt_git_repos():
   175      """
   176      Find and delete corrupt .git directories. This can occur when the agent
   177      reboots in the middle of a git operation. This is *still* less flaky than doing
   178      full clones every time and occasionally timing out because GitHub is throttling us :(
   179  
   180      Git complains with things like this:
   181  
   182      error: object file ws/.git/objects/01/e6eeca... is empty
   183      fatal: loose object 01e6eeca211171e9ae5117bbeed738218d2cdb09
   184          (stored in ws/.git/objects/01/e6eeca..) is corrupt
   185      """
   186      # TODO(rmmh): find a way to run this on boot for each jenkins agent, to
   187      # clean up corrupted git directories before a job can trip over them.
   188      err = 0
   189      for git_dir in glob.glob('/var/lib/jenkins/workspace/*/.git'):
   190          if not subprocess.check_output(['find', git_dir, '-size', '0']):
   191              # git fsck is kind of slow (~30s each), this fast heuristic speeds things up.
   192              continue
   193          print 'validating git dir:', git_dir
   194          corrupt = subprocess.call(['git', '--git-dir', git_dir, 'fsck'])
   195          err |= corrupt  # flag
   196          if err:
   197              print 'deleting corrupt git dir'
   198              err |= subprocess.call(['rm', '-rf', git_dir])
   199      return err
   200  
   201  
   202  def delete_old_workspaces():
   203      """delete old workspace dirs."""
   204      err = 0
   205      live_jobs = None
   206      for host in ('jenkins-master', 'pull-jenkins-master'):
   207          try:
   208              resp = requests.get("http://%s:8080/api/json?pretty=true&tree=jobs[name]" % host)
   209              resp.raise_for_status()
   210              live_jobs = {job['name'] for job in resp.json()['jobs']}
   211          except requests.exceptions.ConnectionError:
   212              continue
   213          except requests.exceptions.RequestException:
   214              traceback.print_exc()
   215      if live_jobs is None:
   216          print 'unable to determine live jenkins jobs, not deleting any workspaces'
   217          return 1
   218      for dirname in sorted(glob.glob('/var/lib/jenkins/workspace/*')):
   219          key = os.path.basename(dirname).replace('@tmp', '')
   220          if key not in live_jobs:
   221              print 'deleting old job workspace', dirname
   222              err |= subprocess.call(['sudo', 'rm', '-rf', dirname])
   223      return err
   224  
   225  
   226  def main(ancient):
   227      """Run maintenance."""
   228      # Copied from http://blog.yohanliyanage.com/2015/05/docker-clean-up-after-yourself/
   229      err = 0
   230      err |= kill_containers()
   231      err |= remove_containers()
   232      err |= remove_images(set(container_images()), ancient)
   233      err |= remove_volumes()
   234      err |= kill_looping_bash()
   235      err |= delete_corrupt_git_repos()
   236      err |= delete_old_workspaces()
   237      sys.exit(err)
   238  
   239  
   240  if __name__ == '__main__':
   241      PARSER = argparse.ArgumentParser(
   242          description='Run hourly maintenance on jenkins agents')
   243      PARSER.add_argument('--ancient', action='store_true', help='Delete all old images')
   244      ARGS = PARSER.parse_args()
   245      main(ARGS.ancient)