github.com/blystad/deis@v0.11.0/controller/scheduler/coreos.py

from cStringIO import StringIO
import base64
import os
import random
import re
import subprocess
import time


ROOT_DIR = os.path.join(os.getcwd(), 'coreos')
if not os.path.exists(ROOT_DIR):
    os.mkdir(ROOT_DIR)

MATCH = re.compile(
    '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z]+)?.(?P<c_num>[0-9]+)')


class FleetClient(object):

    def __init__(self, cluster_name, hosts, auth, domain, options):
        self.name = cluster_name
        self.hosts = hosts
        self.domain = domain
        self.options = options
        self.auth = auth
        self.auth_path = os.path.join(ROOT_DIR, 'ssh-{cluster_name}'.format(**locals()))
        with open(self.auth_path, 'w') as f:
            f.write(base64.b64decode(auth))
        os.chmod(self.auth_path, 0600)

        self.env = {
            'PATH': '/usr/local/bin:/usr/bin:/bin:{}'.format(
                os.path.abspath(os.path.join(__file__, '..'))),
            'FLEETW_KEY': self.auth_path,
            'FLEETW_HOST': random.choice(self.hosts.split(','))}

    # scheduler setup / teardown

    def setUp(self):
        """
        Setup a CoreOS cluster including router and log aggregator
        """
        return

    def tearDown(self):
        """
        Tear down a CoreOS cluster including router and log aggregator
        """
        return

    # announcer helpers

    def _log_skipped_announcer(self, action, name):
        """
        Logs a message stating that this operation doesn't require an announcer
        """
        print "-- skipping announcer {} for {}".format(action, name)

    # job api

    def create(self, name, image, command='', template=None, use_announcer=True, **kwargs):
        """
        Create a new job
        """
        print 'Creating {name}'.format(**locals())
        env = self.env.copy()
        self._create_container(name, image, command, template or CONTAINER_TEMPLATE, env, **kwargs)
        self._create_log(name, image, command, LOG_TEMPLATE, env)

        if use_announcer:
            self._create_announcer(name, image, command, ANNOUNCE_TEMPLATE, env)
        else:
            self._log_skipped_announcer('create', name)

    def _create_container(self, name, image, command, template, env, **kwargs):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        # prepare memory limit for the container type
        mem = kwargs.get('memory', {}).get(l['c_type'], None)
        if mem:
            l.update({'memory': '-m {}'.format(mem.lower())})
        else:
            l.update({'memory': ''})
        # prepare cpu shares for the container type
        cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
        if cpu:
            l.update({'cpu': '-c {}'.format(cpu)})
        else:
            l.update({'cpu': ''})
        env.update({'FLEETW_UNIT': name + '.service'})
        # construct unit from template
        unit = template.format(**l)
        # prepare tags only if one was provided
        tags = kwargs.get('tags', {})
        if tags:
            tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
            unit = unit + '\n[X-Fleet]\nX-ConditionMachineMetadata={}\n'.format(tagset)
        env.update({'FLEETW_UNIT_DATA': base64.b64encode(unit)})
        return subprocess.check_call('fleetctl.sh submit {name}.service'.format(**l),
                                     shell=True, env=env)

    def _create_announcer(self, name, image, command, template, env):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        env.update({'FLEETW_UNIT': name + '-announce' + '.service'})
        env.update({'FLEETW_UNIT_DATA': base64.b64encode(template.format(**l))})
        return subprocess.check_call('fleetctl.sh submit {name}-announce.service'.format(**l),  # noqa
                                     shell=True, env=env)
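
    # Example (illustrative): MATCH splits a unit name into app/version/type/number
    # fields, which the _create_* helpers use to fill in the unit templates at the
    # bottom of this module. A hypothetical name 'myapp_v2.web.1' yields:
    #
    #   re.match(MATCH, 'myapp_v2.web.1').groupdict()
    #   # {'app': 'myapp', 'version': 'v2', 'c_type': 'web', 'c_num': '1'}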
    def _create_log(self, name, image, command, template, env):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        env.update({'FLEETW_UNIT': name + '-log' + '.service'})
        env.update({'FLEETW_UNIT_DATA': base64.b64encode(template.format(**l))})
        return subprocess.check_call('fleetctl.sh submit {name}-log.service'.format(**locals()),  # noqa
                                     shell=True, env=env)

    def start(self, name, use_announcer=True):
        """
        Start an idle job
        """
        print 'Starting {name}'.format(**locals())
        env = self.env.copy()
        self._start_container(name, env)
        self._start_log(name, env)

        if use_announcer:
            self._start_announcer(name, env)
            self._wait_for_announcer(name, env)
        else:
            self._log_skipped_announcer('start', name)

    def _start_log(self, name, env):
        subprocess.check_call(
            'fleetctl.sh start -no-block {name}-log.service'.format(**locals()),
            shell=True, env=env)

    def _start_container(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh start -no-block {name}.service'.format(**locals()),
            shell=True, env=env)

    def _start_announcer(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh start -no-block {name}-announce.service'.format(**locals()),
            shell=True, env=env)

    def _wait_for_announcer(self, name, env):
        status = None
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        for _ in range(1200):
            status = subprocess.check_output(
                "fleetctl.sh list-units --no-legend --fields unit,sub | grep {name}-announce.service | awk '{{print $2}}'".format(**locals()),  # noqa
                shell=True, env=env).strip('\n')
            if status == 'running':
                break
            time.sleep(1)
        else:
            raise RuntimeError('Container failed to start')

    def stop(self, name, use_announcer=True):
        """
        Stop a running job
        """
        print 'Stopping {name}'.format(**locals())
        env = self.env.copy()

        if use_announcer:
            self._stop_announcer(name, env)
        else:
            self._log_skipped_announcer('stop', name)

        self._stop_container(name, env)
        self._stop_log(name, env)

    def _stop_container(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh stop -block-attempts=600 {name}.service'.format(**locals()),
            shell=True, env=env)

    def _stop_announcer(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh stop -block-attempts=600 {name}-announce.service'.format(**locals()),
            shell=True, env=env)

    def _stop_log(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh stop -block-attempts=600 {name}-log.service'.format(**locals()),
            shell=True, env=env)

    def destroy(self, name, use_announcer=True):
        """
        Destroy an existing job
        """
        print 'Destroying {name}'.format(**locals())
        env = self.env.copy()

        if use_announcer:
            self._destroy_announcer(name, env)
        else:
            self._log_skipped_announcer('destroy', name)

        self._destroy_container(name, env)
        self._destroy_log(name, env)

    def _destroy_container(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh destroy {name}.service'.format(**locals()),
            shell=True, env=env)

    def _destroy_announcer(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh destroy {name}-announce.service'.format(**locals()),
            shell=True, env=env)

    def _destroy_log(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh destroy {name}-log.service'.format(**locals()),
            shell=True, env=env)
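
    # The methods above manage each job as a trio of fleet units ({name}.service,
    # {name}-announce.service, {name}-log.service). The two methods below cover
    # one-off commands and attachment: run() shells out to the fleetrun.sh wrapper,
    # waits for it to exit, and returns the exit code along with its combined
    # stdout/stderr; attach() is a placeholder returning empty in-memory streams.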
    def run(self, name, image, command):
        """
        Run a one-off command
        """
        print 'Running {name}'.format(**locals())
        output = subprocess.PIPE
        p = subprocess.Popen('fleetrun.sh {command}'.format(**locals()), shell=True, env=self.env,
                             stdout=output, stderr=subprocess.STDOUT)
        rc = p.wait()
        return rc, p.stdout.read()

    def attach(self, name):
        """
        Attach to a job's stdin, stdout and stderr
        """
        return StringIO(), StringIO(), StringIO()

SchedulerClient = FleetClient


CONTAINER_TEMPLATE = """
[Unit]
Description={name}

[Service]
ExecStartPre=/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"
ExecStartPre=/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"
ExecStart=/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; port=$(docker inspect -f '{{{{range $k, $v := .ContainerConfig.ExposedPorts }}}}{{{{$k}}}}{{{{end}}}}' $IMAGE | cut -d/ -f1) ; docker run --name {name} {memory} {cpu} -P -e PORT=$port $IMAGE {command}"
ExecStop=/usr/bin/docker rm -f {name}
TimeoutStartSec=20m
"""  # noqa

# TODO revisit the "not getting a port" issue after we upgrade to Docker 1.1.0
ANNOUNCE_TEMPLATE = """
[Unit]
Description={name} announce
BindsTo={name}.service

[Service]
EnvironmentFile=/etc/environment
ExecStartPre=/bin/sh -c "until docker inspect -f '{{{{range $i, $e := .NetworkSettings.Ports }}}}{{{{$p := index $e 0}}}}{{{{$p.HostPort}}}}{{{{end}}}}' {name} >/dev/null 2>&1; do sleep 2; done; port=$(docker inspect -f '{{{{range $i, $e := .NetworkSettings.Ports }}}}{{{{$p := index $e 0}}}}{{{{$p.HostPort}}}}{{{{end}}}}' {name}); if [[ -z $port ]]; then echo We have no port...; exit 1; fi; echo Waiting for $port/tcp...; until netstat -lnt | grep :$port >/dev/null; do sleep 1; done"
ExecStart=/bin/sh -c "port=$(docker inspect -f '{{{{range $i, $e := .NetworkSettings.Ports }}}}{{{{$p := index $e 0}}}}{{{{$p.HostPort}}}}{{{{end}}}}' {name}); echo Connected to $COREOS_PRIVATE_IPV4:$port/tcp, publishing to etcd...; while netstat -lnt | grep :$port >/dev/null; do etcdctl set /deis/services/{app}/{name} $COREOS_PRIVATE_IPV4:$port --ttl 60 >/dev/null; sleep 45; done"
ExecStop=/usr/bin/etcdctl rm --recursive /deis/services/{app}/{name}
TimeoutStartSec=20m

[X-Fleet]
X-ConditionMachineOf={name}.service
"""  # noqa

LOG_TEMPLATE = """
[Unit]
Description={name} log
BindsTo={name}.service

[Service]
ExecStartPre=/bin/sh -c "until docker inspect {name} >/dev/null 2>&1; do sleep 1; done"
ExecStart=/bin/sh -c "docker logs -f {name} 2>&1 | logger -p local0.info -t {app}[{c_type}.{c_num}] --udp --server $(etcdctl get /deis/logs/host) --port $(etcdctl get /deis/logs/port)"
TimeoutStartSec=20m

[X-Fleet]
X-ConditionMachineOf={name}.service
"""  # noqa
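

# ----------------------------------------------------------------------------
# Usage sketch (illustrative only): how this scheduler client might be driven.
# It assumes a reachable CoreOS/fleet host, the fleetctl.sh/fleetrun.sh wrappers
# on PATH, and a base64-encoded SSH private key; every value below is a
# placeholder, not part of the Deis controller itself.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    with open(os.path.expanduser('~/.ssh/id_rsa')) as f:
        ssh_key_b64 = base64.b64encode(f.read())
    client = SchedulerClient(
        cluster_name='dev',
        hosts='172.17.8.100',   # comma-separated list; one host is picked at random
        auth=ssh_key_b64,       # decoded and written to coreos/ssh-dev with mode 0600
        domain='example.com',
        options={})
    name = 'myapp_v1.web.1'     # must match the MATCH pattern at the top of the file
    client.create(name, image='myapp:v1')
    client.start(name)          # blocks until the announcer unit reports 'running'
    client.stop(name)
    client.destroy(name)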