github.com/spg/deis@v1.7.3/controller/scheduler/fleet.py (about)

     1  import cStringIO
     2  import base64
     3  import copy
     4  import httplib
     5  import json
     6  import paramiko
     7  import socket
     8  import re
     9  import time
    10  
    11  from django.conf import settings
    12  
    13  from .states import JobState
    14  
    15  
    16  MATCH = re.compile(
    17      '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
    18  RETRIES = 3
    19  
    20  
    21  class UHTTPConnection(httplib.HTTPConnection):
    22      """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    23      """
    24  
    25      def __init__(self, path):
    26          httplib.HTTPConnection.__init__(self, 'localhost')
    27          self.path = path
    28  
    29      def connect(self):
    30          sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    31          sock.connect(self.path)
    32          self.sock = sock
    33  
    34  
    35  class FleetHTTPClient(object):
    36  
    def __init__(self, target, auth, options, pkey):
        """Store the scheduler settings and create the fleet API connection object.

        :param target: passed to UHTTPConnection as the Unix socket path
        :param auth: stored as ``self.auth``
        :param options: stored as ``self.options``
        :param pkey: stored as ``self.pkey``; run() base64-decodes it and loads it
            as an RSA key for SSH
        """
        # one connection object, shared by every API call made through this client
        self.conn = UHTTPConnection(target)
        self.target, self.auth = target, auth
        self.options, self.pkey = options, pkey
    44  
    45      # connection helpers
    46  
    47      def _request_unit(self, method, name, body=None):
    48          headers = {'Content-Type': 'application/json'}
    49          self.conn.request(method, '/v1-alpha/units/{name}.service'.format(**locals()),
    50                                    headers=headers, body=json.dumps(body))
    51          return self.conn.getresponse()
    52  
    def _get_unit(self, name):
        """Fetch the unit ``name`` from fleet, making up to RETRIES attempts.

        :param name: unit name without the ``.service`` suffix
        :returns: the raw response body
        :raises RuntimeError: if the final attempt gets a non-2xx status; any other
            error raised by the final attempt propagates unchanged
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('GET', name)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to retrieve unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt and
                # SystemExit are never swallowed by an early attempt
                if attempt >= (RETRIES - 1):
                    raise
                # drop the socket so the retry starts on a fresh connection rather
                # than one the failure may have left in the middle of a request
                self.conn.close()
    66  
    def _put_unit(self, name, body):
        """PUT ``body`` to the unit ``name``, making up to RETRIES attempts.

        :param name: unit name without the ``.service`` suffix
        :param body: JSON-serializable unit description or desired state
        :returns: the raw response body
        :raises RuntimeError: if the final attempt gets a non-2xx status; any other
            error raised by the final attempt propagates unchanged
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('PUT', name, body)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to create unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt and
                # SystemExit are never swallowed by an early attempt
                if attempt >= (RETRIES - 1):
                    raise
                # drop the socket so the retry starts on a fresh connection rather
                # than one the failure may have left in the middle of a request
                self.conn.close()
    80  
    81      def _delete_unit(self, name):
    82          headers = {'Content-Type': 'application/json'}
    83          self.conn.request('DELETE', '/v1-alpha/units/{name}.service'.format(**locals()),
    84                            headers=headers)
    85          resp = self.conn.getresponse()
    86          data = resp.read()
    87          if resp.status not in (404, 204):
    88              errmsg = "Failed to delete unit: {} {} - {}".format(
    89                  resp.status, resp.reason, data)
    90              raise RuntimeError(errmsg)
    91          return data
    92  
    93      def _get_state(self, name=None):
    94          headers = {'Content-Type': 'application/json'}
    95          url = '/v1-alpha/state'
    96          if name:
    97              url += '?unitName={name}.service'.format(**locals())
    98          self.conn.request('GET', url, headers=headers)
    99          resp = self.conn.getresponse()
   100          data = resp.read()
   101          if resp.status not in (200,):
   102              errmsg = "Failed to retrieve state: {} {} - {}".format(
   103                  resp.status, resp.reason, data)
   104              raise RuntimeError(errmsg)
   105          return json.loads(data)
   106  
   107      def _get_machines(self):
   108          headers = {'Content-Type': 'application/json'}
   109          url = '/v1-alpha/machines'
   110          self.conn.request('GET', url, headers=headers)
   111          resp = self.conn.getresponse()
   112          data = resp.read()
   113          if resp.status not in (200,):
   114              errmsg = "Failed to retrieve machines: {} {} - {}".format(
   115                  resp.status, resp.reason, data)
   116              raise RuntimeError(errmsg)
   117          return json.loads(data)
   118  
   119      # container api
   120  
   121      def create(self, name, image, command='', template=None, **kwargs):
   122          """Create a container"""
   123          self._create_container(name, image, command,
   124                                 template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)
   125  
   126      def _create_container(self, name, image, command, unit, **kwargs):
   127          l = locals().copy()
   128          l.update(re.match(MATCH, name).groupdict())
   129          # prepare memory limit for the container type
   130          mem = kwargs.get('memory', {}).get(l['c_type'], None)
   131          if mem:
   132              l.update({'memory': '-m {}'.format(mem.lower())})
   133          else:
   134              l.update({'memory': ''})
   135          # prepare memory limit for the container type
   136          cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
   137          if cpu:
   138              l.update({'cpu': '-c {}'.format(cpu)})
   139          else:
   140              l.update({'cpu': ''})
   141          # set unit hostname
   142          l.update({'hostname': self._get_hostname(name)})
   143          # should a special entrypoint be used
   144          entrypoint = kwargs.get('entrypoint')
   145          if entrypoint:
   146              l.update({'entrypoint': '{}'.format(entrypoint)})
   147          # encode command as utf-8
   148          if isinstance(l.get('command'), basestring):
   149              l['command'] = l['command'].encode('utf-8')
   150          # construct unit from template
   151          for f in unit:
   152              f['value'] = f['value'].format(**l)
   153          # prepare tags only if one was provided
   154          tags = kwargs.get('tags', {})
   155          if tags:
   156              tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.viewitems()])
   157              unit.append({"section": "X-Fleet", "name": "MachineMetadata",
   158                           "value": tagset})
   159          # post unit to fleet
   160          self._put_unit(name, {"desiredState": "loaded", "options": unit})
   161  
   162      def _get_hostname(self, application_name):
   163          hostname = settings.UNIT_HOSTNAME
   164          if hostname == "default":
   165              return ''
   166          elif hostname == "application":
   167              # replace underscore with dots, since underscore is not valid in DNS hostnames
   168              dns_name = application_name.replace("_", ".")
   169              return '-h ' + dns_name
   170          elif hostname == "server":
   171              return '-h %H'
   172          else:
   173              raise RuntimeError('Unsupported hostname: ' + hostname)
   174  
   175      def start(self, name):
   176          """Start a container"""
   177          self._put_unit(name, {'desiredState': 'launched'})
   178          self._wait_for_container_running(name)
   179  
   180      def _wait_for_container_state(self, name):
   181          # wait for container to get scheduled
   182          for _ in xrange(30):
   183              states = self._get_state(name)
   184              if states and len(states.get('states', [])) == 1:
   185                  return states.get('states')[0]
   186              time.sleep(1)
   187          else:
   188              raise RuntimeError('container timeout while retrieving state')
   189  
   190      def _wait_for_container_running(self, name):
   191          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   192          try:
   193              self._wait_for_job_state(name, JobState.up)
   194          except RuntimeError:
   195              raise RuntimeError('container failed to start')
   196  
   197      def _wait_for_job_state(self, name, state):
   198          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   199          for _ in xrange(1200):
   200              if self.state(name) == state:
   201                  return
   202              time.sleep(1)
   203          else:
   204              raise RuntimeError('timeout waiting for job state: {}'.format(state))
   205  
   206      def _wait_for_destroy(self, name):
   207          for _ in xrange(30):
   208              if not self._get_state(name):
   209                  break
   210              time.sleep(1)
   211          else:
   212              raise RuntimeError('timeout on container destroy')
   213  
   214      def stop(self, name):
   215          """Stop a container"""
   216          self._put_unit(name, {"desiredState": "loaded"})
   217          self._wait_for_job_state(name, JobState.created)
   218  
   219      def destroy(self, name):
   220          """Destroy a container"""
   221          # call all destroy functions, ignoring any errors
   222          try:
   223              self._destroy_container(name)
   224          except:
   225              pass
   226          self._wait_for_destroy(name)
   227  
   228      def _destroy_container(self, name):
   229          for attempt in xrange(RETRIES):
   230              try:
   231                  self._delete_unit(name)
   232                  break
   233              except:
   234                  if attempt == (RETRIES - 1):  # account for 0 indexing
   235                      raise
   236  
   237      def run(self, name, image, entrypoint, command):  # noqa
   238          """Run a one-off command"""
   239          self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
   240                                 entrypoint=entrypoint)
   241          # launch the container
   242          self._put_unit(name, {'desiredState': 'launched'})
   243          # wait for the container to get scheduled
   244          state = self._wait_for_container_state(name)
   245  
   246          try:
   247              machineID = state.get('machineID')
   248  
   249              # find the machine
   250              machines = self._get_machines()
   251              if not machines:
   252                  raise RuntimeError('no available hosts to run command')
   253  
   254              # find the machine's primaryIP
   255              primaryIP = None
   256              for m in machines.get('machines', []):
   257                  if m['id'] == machineID:
   258                      primaryIP = m['primaryIP']
   259              if not primaryIP:
   260                  raise RuntimeError('could not find host')
   261  
   262              # prepare ssh key
   263              file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
   264              pkey = paramiko.RSAKey(file_obj=file_obj)
   265  
   266              # grab output via docker logs over SSH
   267              ssh = paramiko.SSHClient()
   268              ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
   269              ssh.connect(primaryIP, username="core", pkey=pkey)
   270              # share a transport
   271              tran = ssh.get_transport()
   272  
   273              def _do_ssh(cmd):
   274                  chan = tran.open_session()
   275                  # get a pty so stdout/stderr look right
   276                  chan.get_pty()
   277                  out = chan.makefile()
   278                  chan.exec_command(cmd)
   279                  output = out.read()
   280                  rc = chan.recv_exit_status()
   281                  return rc, output
   282  
   283              # wait for container to launch
   284              # we loop indefinitely here, as we have no idea how long the docker pull will take
   285              while True:
   286                  rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
   287                  if rc == 0:
   288                      break
   289                  time.sleep(1)
   290              else:
   291                  raise RuntimeError('failed to create container')
   292  
   293              # wait for container to start
   294              for _ in xrange(2):
   295                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   296                  if _rc != 0:
   297                      raise RuntimeError('failed to inspect container')
   298                  _container = json.loads(_output)
   299                  started_at = _container[0]["State"]["StartedAt"]
   300                  if not started_at.startswith('0001'):
   301                      break
   302                  time.sleep(1)
   303              else:
   304                  raise RuntimeError('container failed to start')
   305  
   306              # wait for container to complete
   307              for _ in xrange(1200):
   308                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   309                  if _rc != 0:
   310                      raise RuntimeError('failed to inspect container')
   311                  _container = json.loads(_output)
   312                  finished_at = _container[0]["State"]["FinishedAt"]
   313                  if not finished_at.startswith('0001'):
   314                      break
   315                  time.sleep(1)
   316              else:
   317                  raise RuntimeError('container timed out')
   318  
   319              # gather container output
   320              _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
   321              if _rc != 0:
   322                  raise RuntimeError('could not attach to container')
   323  
   324              # determine container exit code
   325              _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   326              if _rc != 0:
   327                  raise RuntimeError('could not determine exit code')
   328              container = json.loads(_output)
   329              rc = container[0]["State"]["ExitCode"]
   330  
   331          finally:
   332              # cleanup
   333              self._destroy_container(name)
   334              self._wait_for_destroy(name)
   335  
   336          # return rc and output
   337          return rc, output
   338  
   339      def state(self, name):
   340          systemdActiveStateMap = {
   341              "active": "up",
   342              "reloading": "down",
   343              "inactive": "created",
   344              "failed": "crashed",
   345              "activating": "down",
   346              "deactivating": "down",
   347          }
   348          try:
   349              # NOTE (bacongobbler): this call to ._get_unit() acts as a pre-emptive check to
   350              # determine if the job no longer exists (will raise a RuntimeError on 404)
   351              self._get_unit(name)
   352              state = self._wait_for_container_state(name)
   353              activeState = state['systemdActiveState']
   354              # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
   355              # stop the container, which in our case will return as 'failed', even though
   356              # the container is perfectly fine.
   357              if activeState == 'failed':
   358                  if state['systemdLoadState'] == 'loaded':
   359                      return JobState.created
   360              return getattr(JobState, systemdActiveStateMap[activeState])
   361          except KeyError:
   362              # failed retrieving a proper response from the fleet API
   363              return JobState.error
   364          except RuntimeError:
   365              # failed to retrieve a response from the fleet API,
   366              # which means it does not exist
   367              return JobState.destroyed
   368  
   369      def attach(self, name):
   370          """
   371          Attach to a job's stdin, stdout and stderr
   372          """
   373          raise NotImplementedError
   374  
# Expose FleetHTTPClient under the generic name SchedulerClient
# (presumably the name the scheduler loader looks up -- confirm against callers).
SchedulerClient = FleetHTTPClient
   376  
   377  
# Default unit options used by FleetHTTPClient.create(). Each entry is one unit
# option (section / name / value). _create_container() runs str.format over every
# "value", so the placeholders used here are {name}, {image}, {memory}, {cpu},
# {hostname} and {command}.
# NOTE(review): {command} is substituted inside a double-quoted /bin/sh -c string;
# a command containing double quotes presumably breaks the quoting -- confirm.
CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    # pull the image from the registry whose host and port are read via etcdctl
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    # remove any existing container with the same name; never fails (|| true)
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} {memory} {cpu} {hostname} -P $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker rm -f {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]
   390  
   391  
# Unit options used by FleetHTTPClient.run() for one-off commands. Formatted by
# _create_container() like CONTAINER_TEMPLATE; the placeholders used here are
# {name}, {image}, {entrypoint} and {command}. Unlike CONTAINER_TEMPLATE it has no
# ExecStop, Restart or TimeoutStopSec entries.
# NOTE(review): {command} and {entrypoint} are substituted inside a double-quoted
# /bin/sh -c string; values containing double quotes presumably break the
# quoting -- confirm.
RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    # pull the image from the registry whose host and port are read via etcdctl
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    # remove any existing container with the same name; never fails (|| true)
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]