github.com/dustinrc/deis@v1.10.1-0.20150917223407-0894a5fb979e/controller/scheduler/fleet.py (about)

     1  import base64
     2  import copy
     3  import cStringIO
     4  import httplib
     5  import json
     6  import paramiko
     7  import re
     8  import socket
     9  import time
    10  
    11  from django.conf import settings
    12  
    13  from . import AbstractSchedulerClient
    14  from .states import JobState
    15  
    16  
    17  MATCH = re.compile(
    18      '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
    19  RETRIES = 3
    20  
    21  
    22  class UHTTPConnection(httplib.HTTPConnection):
    23      """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    24      """
    25  
    26      def __init__(self, path):
    27          httplib.HTTPConnection.__init__(self, 'localhost')
    28          self.path = path
    29  
    30      def connect(self):
    31          sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    32          sock.connect(self.path)
    33          self.sock = sock
    34  
    35  
    36  class FleetHTTPClient(AbstractSchedulerClient):
    37  
    def __init__(self, target, auth, options, pkey):
        """Set up the scheduler client and its connection object for the fleet API.

        All four arguments are passed unchanged to ``AbstractSchedulerClient``.
        This class later reads ``self.target`` (used here as the path of a Unix
        domain socket) and ``self.pkey`` (used by ``run``); presumably the base
        class stores the arguments under those names -- confirm there.
        """
        super(FleetHTTPClient, self).__init__(target, auth, options, pkey)
        # one connection object per client instance, reused by every request helper below
        self.conn = UHTTPConnection(self.target)
    42  
    43      # connection helpers
    44  
    def _request_unit(self, method, name, body=None):
        """Send one request for a single unit to the fleet API.

        ``method`` is the HTTP verb, ``name`` the unit name without the
        ``.service`` suffix and ``body`` an optional JSON-serializable payload.

        Returns the response object from the shared connection; the caller has
        to ``read()`` it before the connection is used for another request.

        Any error raised while sending the request or receiving the response
        is re-raised after the connection has been closed.
        """
        headers = {'Content-Type': 'application/json'}
        url = '/v1-alpha/units/{name}.service'.format(name=name)
        # Only serialize a real payload: json.dumps(None) would put the literal
        # text "null" into the body of requests that have nothing to send.
        payload = json.dumps(body) if body is not None else None
        try:
            self.conn.request(method, url, headers=headers, body=payload)
            return self.conn.getresponse()
        except Exception:
            # A failed exchange leaves the connection object in a mid-request
            # state in which it refuses further requests. Closing it lets the
            # next request (e.g. a retry by the caller) start from scratch.
            self.conn.close()
            raise
    50  
    def _get_unit(self, name):
        """Fetch a unit from the fleet API and return the raw response body.

        Up to ``RETRIES`` attempts are made. Every ordinary failure is retried,
        including the ``RuntimeError`` raised here for a non-2xx status; the
        error of the last attempt is propagated to the caller.

        Raises:
            RuntimeError: if the final attempt returns a non-2xx status.
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('GET', name)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to retrieve unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # Deliberately not a bare except: KeyboardInterrupt and
                # SystemExit must stop the process instead of being retried.
                if attempt >= (RETRIES - 1):
                    raise
    64  
    def _put_unit(self, name, body):
        """PUT ``body`` for a unit to the fleet API and return the raw response body.

        Up to ``RETRIES`` attempts are made. Every ordinary failure is retried,
        including the ``RuntimeError`` raised here for a non-2xx status; the
        error of the last attempt is propagated to the caller.

        Raises:
            RuntimeError: if the final attempt returns a non-2xx status.
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('PUT', name, body)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to create unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # Deliberately not a bare except: KeyboardInterrupt and
                # SystemExit must stop the process instead of being retried.
                if attempt >= (RETRIES - 1):
                    raise
    78  
    def _delete_unit(self, name):
        """Issue a DELETE for the named unit and return the raw response body.

        Status 204 and 404 are both accepted, so deleting a unit that is
        already gone is not an error.

        Raises:
            RuntimeError: for any other response status.
        """
        content_type = {'Content-Type': 'application/json'}
        unit_url = '/v1-alpha/units/{name}.service'.format(name=name)
        self.conn.request('DELETE', unit_url, headers=content_type)
        response = self.conn.getresponse()
        payload = response.read()
        if response.status in (404, 204):
            return payload
        raise RuntimeError("Failed to delete unit: {} {} - {}".format(
            response.status, response.reason, payload))
    90  
    def _get_state(self, name=None):
        """Query the fleet state endpoint and return the decoded JSON response.

        With a truthy ``name`` the query is narrowed to ``<name>.service`` via
        the ``unitName`` parameter; otherwise the unfiltered state is requested.

        Raises:
            RuntimeError: if the response status is not 200.
        """
        state_url = '/v1-alpha/state'
        if name:
            state_url += '?unitName={name}.service'.format(name=name)
        self.conn.request('GET', state_url, headers={'Content-Type': 'application/json'})
        response = self.conn.getresponse()
        payload = response.read()
        if response.status != 200:
            raise RuntimeError("Failed to retrieve state: {} {} - {}".format(
                response.status, response.reason, payload))
        return json.loads(payload)
   104  
   105      def _get_machines(self):
   106          headers = {'Content-Type': 'application/json'}
   107          url = '/v1-alpha/machines'
   108          self.conn.request('GET', url, headers=headers)
   109          resp = self.conn.getresponse()
   110          data = resp.read()
   111          if resp.status not in (200,):
   112              errmsg = "Failed to retrieve machines: {} {} - {}".format(
   113                  resp.status, resp.reason, data)
   114              raise RuntimeError(errmsg)
   115          return json.loads(data)
   116  
   117      # container api
   118  
   119      def create(self, name, image, command='', template=None, **kwargs):
   120          """Create a container."""
   121          self._create_container(name, image, command,
   122                                 template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)
   123  
   124      def _create_container(self, name, image, command, unit, **kwargs):
   125          l = locals().copy()
   126          l.update(re.match(MATCH, name).groupdict())
   127          # prepare memory limit for the container type
   128          mem = kwargs.get('memory', {}).get(l['c_type'], None)
   129          if mem:
   130              l.update({'memory': '-m {}'.format(mem.lower())})
   131          else:
   132              l.update({'memory': ''})
   133          # prepare memory limit for the container type
   134          cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
   135          if cpu:
   136              l.update({'cpu': '-c {}'.format(cpu)})
   137          else:
   138              l.update({'cpu': ''})
   139          # set unit hostname
   140          l.update({'hostname': self._get_hostname(name)})
   141          # should a special entrypoint be used
   142          entrypoint = kwargs.get('entrypoint')
   143          if entrypoint:
   144              l.update({'entrypoint': '{}'.format(entrypoint)})
   145          # encode command as utf-8
   146          if isinstance(l.get('command'), basestring):
   147              l['command'] = l['command'].encode('utf-8')
   148          # construct unit from template
   149          for f in unit:
   150              f['value'] = f['value'].format(**l)
   151          # prepare tags only if one was provided
   152          tags = kwargs.get('tags', {})
   153          tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.viewitems()])
   154          if settings.ENABLE_PLACEMENT_OPTIONS in ['true', 'True', 'TRUE', '1']:
   155              unit.append({"section": "X-Fleet", "name": "MachineMetadata",
   156                           "value": tagset + ' "dataPlane=true"'})
   157          # post unit to fleet
   158          self._put_unit(name, {"desiredState": "loaded", "options": unit})
   159  
   160      def _get_hostname(self, application_name):
   161          hostname = settings.UNIT_HOSTNAME
   162          if hostname == "default":
   163              return ''
   164          elif hostname == "application":
   165              # replace underscore with dots, since underscore is not valid in DNS hostnames
   166              dns_name = application_name.replace("_", ".")
   167              return '-h ' + dns_name
   168          elif hostname == "server":
   169              return '-h %H'
   170          else:
   171              raise RuntimeError('Unsupported hostname: ' + hostname)
   172  
   173      def start(self, name):
   174          """Start a container."""
   175          self._put_unit(name, {'desiredState': 'launched'})
   176          self._wait_for_container_running(name)
   177  
   178      def _wait_for_container_state(self, name):
   179          # wait for container to get scheduled
   180          for _ in xrange(30):
   181              states = self._get_state(name)
   182              if states and len(states.get('states', [])) == 1:
   183                  return states.get('states')[0]
   184              time.sleep(1)
   185          else:
   186              raise RuntimeError('container timeout while retrieving state')
   187  
    def _wait_for_container_running(self, name):
        """Block until the job is reported as ``JobState.up``.

        The waiting, including how long to wait, is delegated to
        ``_wait_for_job_state``.

        Raises:
            RuntimeError: with the message 'container failed to start' when
                that wait fails with a RuntimeError.
        """
        # translate the generic wait failure into a start-specific error message
        try:
            self._wait_for_job_state(name, JobState.up)
        except RuntimeError:
            raise RuntimeError('container failed to start')
   194  
   195      def _wait_for_job_state(self, name, state):
   196          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   197          for _ in xrange(1200):
   198              if self.state(name) == state:
   199                  return
   200              time.sleep(1)
   201          else:
   202              raise RuntimeError('timeout waiting for job state: {}'.format(state))
   203  
   204      def _wait_for_destroy(self, name):
   205          for _ in xrange(30):
   206              if not self._get_state(name):
   207                  break
   208              time.sleep(1)
   209          else:
   210              raise RuntimeError('timeout on container destroy')
   211  
   212      def stop(self, name):
   213          """Stop a container."""
   214          self._put_unit(name, {"desiredState": "loaded"})
   215          self._wait_for_job_state(name, JobState.created)
   216  
   217      def destroy(self, name):
   218          """Destroy a container."""
   219          # call all destroy functions, ignoring any errors
   220          try:
   221              self._destroy_container(name)
   222          except:
   223              pass
   224          self._wait_for_destroy(name)
   225  
   226      def _destroy_container(self, name):
   227          for attempt in xrange(RETRIES):
   228              try:
   229                  self._delete_unit(name)
   230                  break
   231              except:
   232                  if attempt == (RETRIES - 1):  # account for 0 indexing
   233                      raise
   234  
   235      def run(self, name, image, entrypoint, command):  # noqa
   236          """Run a one-off command."""
   237          self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
   238                                 entrypoint=entrypoint)
   239          # launch the container
   240          self._put_unit(name, {'desiredState': 'launched'})
   241          # wait for the container to get scheduled
   242          state = self._wait_for_container_state(name)
   243  
   244          try:
   245              machineID = state.get('machineID')
   246  
   247              # find the machine
   248              machines = self._get_machines()
   249              if not machines:
   250                  raise RuntimeError('no available hosts to run command')
   251  
   252              # find the machine's primaryIP
   253              primaryIP = None
   254              for m in machines.get('machines', []):
   255                  if m['id'] == machineID:
   256                      primaryIP = m['primaryIP']
   257              if not primaryIP:
   258                  raise RuntimeError('could not find host')
   259  
   260              # prepare ssh key
   261              file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
   262              pkey = paramiko.RSAKey(file_obj=file_obj)
   263  
   264              # grab output via docker logs over SSH
   265              ssh = paramiko.SSHClient()
   266              ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
   267              ssh.connect(primaryIP, username="core", pkey=pkey)
   268              # share a transport
   269              tran = ssh.get_transport()
   270  
   271              def _do_ssh(cmd):
   272                  with tran.open_session() as chan:
   273                      chan.exec_command(cmd)
   274                      while not chan.exit_status_ready():
   275                          time.sleep(1)
   276                      out = chan.makefile()
   277                      output = out.read()
   278                      rc = chan.recv_exit_status()
   279                      return rc, output
   280  
   281              # wait for container to launch
   282              # we loop indefinitely here, as we have no idea how long the docker pull will take
   283              while True:
   284                  rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
   285                  if rc == 0:
   286                      break
   287                  time.sleep(1)
   288              else:
   289                  raise RuntimeError('failed to create container')
   290  
   291              # wait for container to start
   292              for _ in xrange(2):
   293                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   294                  if _rc != 0:
   295                      raise RuntimeError('failed to inspect container')
   296                  _container = json.loads(_output)
   297                  started_at = _container[0]["State"]["StartedAt"]
   298                  if not started_at.startswith('0001'):
   299                      break
   300                  time.sleep(1)
   301              else:
   302                  raise RuntimeError('container failed to start')
   303  
   304              # wait for container to complete
   305              for _ in xrange(1200):
   306                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   307                  if _rc != 0:
   308                      raise RuntimeError('failed to inspect container')
   309                  _container = json.loads(_output)
   310                  finished_at = _container[0]["State"]["FinishedAt"]
   311                  if not finished_at.startswith('0001'):
   312                      break
   313                  time.sleep(1)
   314              else:
   315                  raise RuntimeError('container timed out')
   316  
   317              # gather container output
   318              _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
   319              if _rc != 0:
   320                  raise RuntimeError('could not attach to container')
   321  
   322              # determine container exit code
   323              _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   324              if _rc != 0:
   325                  raise RuntimeError('could not determine exit code')
   326              container = json.loads(_output)
   327              rc = container[0]["State"]["ExitCode"]
   328  
   329          finally:
   330              # cleanup
   331              self._destroy_container(name)
   332              self._wait_for_destroy(name)
   333  
   334          # return rc and output
   335          return rc, output
   336  
   337      def state(self, name):
   338          """Display the given job's running state."""
   339          systemdActiveStateMap = {
   340              'active': 'up',
   341              'reloading': 'down',
   342              'inactive': 'created',
   343              'failed': 'crashed',
   344              'activating': 'down',
   345              'deactivating': 'down',
   346          }
   347          try:
   348              # NOTE (bacongobbler): this call to ._get_unit() acts as a pre-emptive check to
   349              # determine if the job no longer exists (will raise a RuntimeError on 404)
   350              self._get_unit(name)
   351              state = self._wait_for_container_state(name)
   352              activeState = state['systemdActiveState']
   353              # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
   354              # stop the container, which in our case will return as 'failed', even though
   355              # the container is perfectly fine.
   356              if activeState == 'failed' and state['systemdLoadState'] == 'loaded':
   357                  return JobState.created
   358              return getattr(JobState, systemdActiveStateMap[activeState])
   359          except KeyError:
   360              # failed retrieving a proper response from the fleet API
   361              return JobState.error
   362          except RuntimeError:
   363              # failed to retrieve a response from the fleet API,
   364              # which means it does not exist
   365              return JobState.destroyed
   366  
# Generic alias for this module's client implementation.
# NOTE(review): presumably the name looked up by whatever loads scheduler
# modules -- confirm against the importing code.
SchedulerClient = FleetHTTPClient
   368  
   369  
# Unit options used by FleetHTTPClient.create() when the caller supplies no
# template. Each entry names a unit-file section, an option name and a value;
# the values are run through str.format() in _create_container(), which fills
# the {name}, {image}, {memory}, {cpu}, {hostname} and {command} placeholders.
# The registry host and port are read from etcd when the unit runs.
CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    # pull the image from the registry before starting
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    # remove a leftover container of the same name, ignoring failures
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    # run the container with the optional memory, cpu and hostname flags
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --rm {memory} {cpu} {hostname} -P $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    # 20m start timeout; _wait_for_job_state() polls for the same length of time
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]
   381  
   382  
# Unit options used by FleetHTTPClient.run() for one-off commands. The values
# are run through str.format() in _create_container(), which fills the {name},
# {image}, {entrypoint} and {command} placeholders. Compared with
# CONTAINER_TEMPLATE there is no --rm flag and there are no ExecStop or restart
# options; run() inspects the container and reads its logs after it finishes.
RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    # pull the image from the registry before starting
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    # remove a leftover container of the same name, ignoring failures
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    # run the command with the requested entrypoint, attached to stdout and stderr
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]