github.com/chasestarr/deis@v1.13.5-0.20170519182049-1d9e59fbdbfc/controller/scheduler/fleet.py (about)

     1  import base64
     2  import copy
     3  import cStringIO
     4  import httplib
     5  import json
     6  import paramiko
     7  import re
     8  import socket
     9  import time
    10  
    11  from django.conf import settings
    12  
    13  from . import AbstractSchedulerClient
    14  from .states import JobState
    15  
    16  
    17  MATCH = re.compile(
    18      '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
    19  RETRIES = 3
    20  
    21  
    22  class UHTTPConnection(httplib.HTTPConnection):
    23      """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    24      """
    25  
    26      def __init__(self, path):
    27          httplib.HTTPConnection.__init__(self, 'localhost')
    28          self.path = path
    29  
    30      def connect(self):
    31          sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    32          sock.connect(self.path)
    33          self.sock = sock
    34  
    35  
    36  class FleetHTTPClient(AbstractSchedulerClient):
    37  
    def __init__(self, target, auth, options, pkey):
        """Initialise the scheduler client and prepare the fleet connection.

        All arguments are handed to the base class unchanged. This class only
        adds ``self.conn``: one connection object, built from ``self.target``
        (used as the Unix socket path), that every API helper of this
        instance reuses.
        """
        super(FleetHTTPClient, self).__init__(target, auth, options, pkey)
        socket_path = self.target
        self.conn = UHTTPConnection(socket_path)
    42  
    43      # connection helpers
    44  
    def _request_unit(self, method, name, body=None):
        """Send a request for the unit ``<name>.service`` and return the response.

        :param method: HTTP verb, e.g. 'GET' or 'PUT'
        :param name: unit name without the ``.service`` suffix
        :param body: optional JSON-serialisable payload; when it is None no
            request body is sent
        :returns: the response object of the connection (body not yet read)

        If sending the request or receiving the response fails, the shared
        connection is closed before the error is re-raised, so the next call
        starts from a fresh connection instead of a half-finished exchange.
        """
        headers = {'Content-Type': 'application/json'}
        url = '/v1-alpha/units/{name}.service'.format(name=name)
        # only serialise a payload that was actually supplied; dumping None
        # would put a literal "null" body on bodiless requests such as GET
        payload = json.dumps(body) if body is not None else None
        try:
            self.conn.request(method, url, body=payload, headers=headers)
            return self.conn.getresponse()
        except Exception:
            self.conn.close()
            raise
    50  
    def _get_unit(self, name):
        """Fetch the unit ``name`` from fleet, making up to RETRIES attempts.

        :param name: unit name without the ``.service`` suffix
        :returns: the raw response body of the first successful attempt
        :raises RuntimeError: if the last attempt got a status outside 200-299;
            any other error raised by the last attempt is re-raised unchanged
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('GET', name)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to retrieve unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # Exception instead of a bare except, so KeyboardInterrupt and
                # SystemExit are never swallowed by a retry
                if attempt >= (RETRIES - 1):
                    raise
                # drop the connection so the retry starts with a fresh one
                self.conn.close()
    64  
    def _put_unit(self, name, body):
        """PUT ``body`` for the unit ``name``, making up to RETRIES attempts.

        :param name: unit name without the ``.service`` suffix
        :param body: JSON-serialisable request document
        :returns: the raw response body of the first successful attempt
        :raises RuntimeError: if the last attempt got a status outside 200-299;
            any other error raised by the last attempt is re-raised unchanged
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('PUT', name, body)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to create unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # Exception instead of a bare except, so KeyboardInterrupt and
                # SystemExit are never swallowed by a retry
                if attempt >= (RETRIES - 1):
                    raise
                # drop the connection so the retry starts with a fresh one
                self.conn.close()
    78  
    def _delete_unit(self, name):
        """Send a DELETE for the unit ``<name>.service``.

        Response statuses 204 and 404 are both treated as success.

        :returns: the raw response body
        :raises RuntimeError: for every other response status
        """
        path = '/v1-alpha/units/{}.service'.format(name)
        self.conn.request('DELETE', path, headers={'Content-Type': 'application/json'})
        response = self.conn.getresponse()
        payload = response.read()
        if response.status in (204, 404):
            return payload
        raise RuntimeError("Failed to delete unit: {} {} - {}".format(
            response.status, response.reason, payload))
    90  
    def _get_state(self, name=None):
        """Query the state endpoint and return the decoded JSON document.

        :param name: when truthy, the query is restricted to ``<name>.service``
        :raises RuntimeError: if the response status is not 200
        """
        query = '?unitName={}.service'.format(name) if name else ''
        self.conn.request('GET', '/v1-alpha/state' + query,
                          headers={'Content-Type': 'application/json'})
        response = self.conn.getresponse()
        payload = response.read()
        if response.status != 200:
            raise RuntimeError("Failed to retrieve state: {} {} - {}".format(
                response.status, response.reason, payload))
        return json.loads(payload)
   104  
   105      def _get_machines(self):
   106          headers = {'Content-Type': 'application/json'}
   107          url = '/v1-alpha/machines'
   108          self.conn.request('GET', url, headers=headers)
   109          resp = self.conn.getresponse()
   110          data = resp.read()
   111          if resp.status not in (200,):
   112              errmsg = "Failed to retrieve machines: {} {} - {}".format(
   113                  resp.status, resp.reason, data)
   114              raise RuntimeError(errmsg)
   115          return json.loads(data)
   116  
   117      # container api
   118  
   119      def create(self, name, image, command='', template=None, **kwargs):
   120          """Create a container."""
   121          self._create_container(name, image, command,
   122                                 template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)
   123  
   124      def _create_container(self, name, image, command, unit, **kwargs):
   125          l = locals().copy()
   126          l.update(re.match(MATCH, name).groupdict())
   127          # prepare memory limit for the container type
   128          mem = kwargs.get('memory', {}).get(l['c_type'], None)
   129          if mem:
   130              l.update({'memory': '-m {} {}'.format(mem.lower(), settings.DISABLE_SWAP)})
   131          else:
   132              l.update({'memory': ''})
   133          # prepare memory limit for the container type
   134          cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
   135          if cpu:
   136              l.update({'cpu': '-c {}'.format(cpu)})
   137          else:
   138              l.update({'cpu': ''})
   139          # set unit hostname
   140          l.update({'hostname': self._get_hostname(name)})
   141          # should a special entrypoint be used
   142          entrypoint = kwargs.get('entrypoint')
   143          if entrypoint:
   144              l.update({'entrypoint': '{}'.format(entrypoint)})
   145          # encode command as utf-8
   146          if isinstance(l.get('command'), basestring):
   147              l['command'] = l['command'].encode('utf-8')
   148          # construct unit from template
   149          for f in unit:
   150              f['value'] = f['value'].format(**l)
   151          # prepare tags only if one was provided
   152          tags = kwargs.get('tags', {})
   153          unit_tags = tags.viewitems()
   154          if settings.ENABLE_PLACEMENT_OPTIONS in ['true', 'True', 'TRUE', '1']:
   155              tags['dataPlane'] = 'true'
   156          if unit_tags:
   157              tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in unit_tags])
   158              unit.append({"section": "X-Fleet", "name": "MachineMetadata",
   159                           "value": tagset})
   160          # post unit to fleet
   161          self._put_unit(name, {"desiredState": "loaded", "options": unit})
   162  
   163      def _get_hostname(self, application_name):
   164          hostname = settings.UNIT_HOSTNAME
   165          if hostname == "default":
   166              return ''
   167          elif hostname == "application":
   168              # replace underscore with dots, since underscore is not valid in DNS hostnames
   169              dns_name = application_name.replace("_", ".")
   170              return '-h ' + dns_name
   171          elif hostname == "server":
   172              return '-h %H'
   173          else:
   174              raise RuntimeError('Unsupported hostname: ' + hostname)
   175  
   176      def start(self, name):
   177          """Start a container."""
   178          self._put_unit(name, {'desiredState': 'launched'})
   179          self._wait_for_container_running(name)
   180  
   181      def _wait_for_container_state(self, name):
   182          # wait for container to get scheduled
   183          for _ in xrange(30):
   184              states = self._get_state(name)
   185              if states and len(states.get('states', [])) == 1:
   186                  return states.get('states')[0]
   187              time.sleep(1)
   188          else:
   189              raise RuntimeError('container timeout while retrieving state')
   190  
   191      def _wait_for_container_running(self, name):
   192          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   193          try:
   194              self._wait_for_job_state(name, JobState.up)
   195          except RuntimeError:
   196              raise RuntimeError('container failed to start')
   197  
   198      def _wait_for_job_state(self, name, state):
   199          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   200          for _ in xrange(1200):
   201              if self.state(name) == state:
   202                  return
   203              time.sleep(1)
   204          else:
   205              raise RuntimeError('timeout waiting for job state: {}'.format(state))
   206  
   207      def _wait_for_destroy(self, name):
   208          for _ in xrange(30):
   209              if not self._get_state(name):
   210                  break
   211              time.sleep(1)
   212          else:
   213              raise RuntimeError('timeout on container destroy')
   214  
   215      def stop(self, name):
   216          """Stop a container."""
   217          self._put_unit(name, {"desiredState": "loaded"})
   218          self._wait_for_job_state(name, JobState.created)
   219  
   220      def destroy(self, name):
   221          """Destroy a container."""
   222          # call all destroy functions, ignoring any errors
   223          try:
   224              self._destroy_container(name)
   225          except:
   226              pass
   227          self._wait_for_destroy(name)
   228  
   229      def _destroy_container(self, name):
   230          for attempt in xrange(RETRIES):
   231              try:
   232                  self._delete_unit(name)
   233                  break
   234              except:
   235                  if attempt == (RETRIES - 1):  # account for 0 indexing
   236                      raise
   237  
   238      def run(self, name, image, entrypoint, command):  # noqa
   239          """Run a one-off command."""
   240          self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
   241                                 entrypoint=entrypoint)
   242          # launch the container
   243          self._put_unit(name, {'desiredState': 'launched'})
   244          # wait for the container to get scheduled
   245          state = self._wait_for_container_state(name)
   246  
   247          try:
   248              machineID = state.get('machineID')
   249  
   250              # find the machine
   251              machines = self._get_machines()
   252              if not machines:
   253                  raise RuntimeError('no available hosts to run command')
   254  
   255              # find the machine's primaryIP
   256              primaryIP = None
   257              for m in machines.get('machines', []):
   258                  if m['id'] == machineID:
   259                      primaryIP = m['primaryIP']
   260              if not primaryIP:
   261                  raise RuntimeError('could not find host')
   262  
   263              # prepare ssh key
   264              file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
   265              pkey = paramiko.RSAKey(file_obj=file_obj)
   266  
   267              # grab output via docker logs over SSH
   268              ssh = paramiko.SSHClient()
   269              ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
   270              ssh.connect(primaryIP, username="core", pkey=pkey)
   271              # share a transport
   272              tran = ssh.get_transport()
   273  
   274              def _do_ssh(cmd):
   275                  with tran.open_session() as chan:
   276                      chan.exec_command(cmd)
   277                      while not chan.exit_status_ready():
   278                          time.sleep(1)
   279                      out = chan.makefile()
   280                      output = out.read()
   281                      rc = chan.recv_exit_status()
   282                      return rc, output
   283  
   284              # wait for container to launch
   285              # we loop indefinitely here, as we have no idea how long the docker pull will take
   286              while True:
   287                  rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
   288                  if rc == 0:
   289                      break
   290                  time.sleep(1)
   291              else:
   292                  raise RuntimeError('failed to create container')
   293  
   294              # wait for container to start
   295              for _ in xrange(2):
   296                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   297                  if _rc != 0:
   298                      raise RuntimeError('failed to inspect container')
   299                  _container = json.loads(_output)
   300                  started_at = _container[0]["State"]["StartedAt"]
   301                  if not started_at.startswith('0001'):
   302                      break
   303                  time.sleep(1)
   304              else:
   305                  raise RuntimeError('container failed to start')
   306  
   307              # wait for container to complete
   308              for _ in xrange(1200):
   309                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   310                  if _rc != 0:
   311                      raise RuntimeError('failed to inspect container')
   312                  _container = json.loads(_output)
   313                  finished_at = _container[0]["State"]["FinishedAt"]
   314                  if not finished_at.startswith('0001'):
   315                      break
   316                  time.sleep(1)
   317              else:
   318                  raise RuntimeError('container timed out')
   319  
   320              # gather container output
   321              _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
   322              if _rc != 0:
   323                  raise RuntimeError('could not attach to container')
   324  
   325              # determine container exit code
   326              _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   327              if _rc != 0:
   328                  raise RuntimeError('could not determine exit code')
   329              container = json.loads(_output)
   330              rc = container[0]["State"]["ExitCode"]
   331  
   332          finally:
   333              # cleanup
   334              self._destroy_container(name)
   335              self._wait_for_destroy(name)
   336  
   337          # return rc and output
   338          return rc, output
   339  
   340      def state(self, name):
   341          """Display the given job's running state."""
   342          systemdActiveStateMap = {
   343              'active': 'up',
   344              'reloading': 'down',
   345              'inactive': 'created',
   346              'failed': 'crashed',
   347              'activating': 'down',
   348              'deactivating': 'down',
   349          }
   350          try:
   351              # NOTE (bacongobbler): this call to ._get_unit() acts as a pre-emptive check to
   352              # determine if the job no longer exists (will raise a RuntimeError on 404)
   353              self._get_unit(name)
   354              state = self._wait_for_container_state(name)
   355              activeState = state['systemdActiveState']
   356              # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
   357              # stop the container, which in our case will return as 'failed', even though
   358              # the container is perfectly fine.
   359              if activeState == 'failed' and state['systemdLoadState'] == 'loaded':
   360                  return JobState.created
   361              return getattr(JobState, systemdActiveStateMap[activeState])
   362          except KeyError:
   363              # failed retrieving a proper response from the fleet API
   364              return JobState.error
   365          except RuntimeError:
   366              # failed to retrieve a response from the fleet API,
   367              # which means it does not exist
   368              return JobState.destroyed
   369  
# Module-level alias for the scheduler client implemented in this file.
# NOTE(review): presumably looked up under this name by whatever loads the
# scheduler module - confirm against the caller.
SchedulerClient = FleetHTTPClient
   371  
   372  
# Unit options used by FleetHTTPClient.create() when no template is given.
# Every "value" is passed through str.format() by _create_container(); the
# placeholders used here are {name}, {image}, {memory}, {cpu}, {hostname} and
# {command}.
CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    # pull the image from the registry whose host/port are read from etcd
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    # remove a container of the same name if docker already knows one
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --rm {memory} {cpu} {hostname} -P $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]
   384  
   385  
# Unit options used by FleetHTTPClient.run() for one-off commands.
# Every "value" is passed through str.format() by _create_container(); the
# placeholders used here are {name}, {image}, {entrypoint} and {command}.
RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    # pull the image from the registry whose host/port are read from etcd
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    # remove a container of the same name if docker already knows one
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]