github.com/rvaralda/deis@v1.4.1/controller/scheduler/fleet.py (about)

     1  import cStringIO
     2  import base64
     3  import copy
     4  import httplib
     5  import json
     6  import paramiko
     7  import socket
     8  import re
     9  import time
    10  
    11  from django.conf import settings
    12  
    13  from .states import JobState
    14  
    15  
    16  MATCH = re.compile(
    17      '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
    18  RETRIES = 3
    19  
    20  
    21  class UHTTPConnection(httplib.HTTPConnection):
    22      """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    23      """
    24  
    25      def __init__(self, path):
    26          httplib.HTTPConnection.__init__(self, 'localhost')
    27          self.path = path
    28  
    29      def connect(self):
    30          sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    31          sock.connect(self.path)
    32          self.sock = sock
    33  
    34  
    35  class FleetHTTPClient(object):
    36  
    def __init__(self, target, auth, options, pkey):
        """Remember the scheduler settings and build the fleet API connection.

        ``target`` is handed to UHTTPConnection as the socket path and
        ``pkey`` is base64-decoded into an SSH key by ``run``; ``auth`` and
        ``options`` are only stored on the instance here.
        """
        self.target, self.auth = target, auth
        self.options, self.pkey = options, pkey
        # one connection object, reused for every request made by this client
        self.conn = UHTTPConnection(target)
    44  
    45      # connection helpers
    46  
    47      def _request_unit(self, method, name, body=None):
    48          headers = {'Content-Type': 'application/json'}
    49          self.conn.request(method, '/v1-alpha/units/{name}.service'.format(**locals()),
    50                                    headers=headers, body=json.dumps(body))
    51          return self.conn.getresponse()
    52  
    def _get_unit(self, name):
        """Fetch the unit ``name`` and return the raw (undecoded) response body.

        The request is attempted up to RETRIES times. A non-2xx answer raises
        RuntimeError; the error of the final attempt is propagated to the
        caller.
        """
        for attempt in range(RETRIES):
            try:
                resp = self._request_unit('GET', name)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to retrieve unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit are no longer caught here
                if attempt >= (RETRIES - 1):
                    raise
                # a request that failed half-way leaves the shared connection
                # unusable; closing it makes the next attempt reconnect
                self.conn.close()
    66  
    def _put_unit(self, name, body):
        """PUT ``body`` for the unit ``name`` and return the raw response body.

        The request is attempted up to RETRIES times. A non-2xx answer raises
        RuntimeError; the error of the final attempt is propagated to the
        caller.
        """
        for attempt in range(RETRIES):
            try:
                resp = self._request_unit('PUT', name, body)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to create unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit are no longer caught here
                if attempt >= (RETRIES - 1):
                    raise
                # a request that failed half-way leaves the shared connection
                # unusable; closing it makes the next attempt reconnect
                self.conn.close()
    80  
    81      def _delete_unit(self, name):
    82          headers = {'Content-Type': 'application/json'}
    83          self.conn.request('DELETE', '/v1-alpha/units/{name}.service'.format(**locals()),
    84                            headers=headers)
    85          resp = self.conn.getresponse()
    86          data = resp.read()
    87          if resp.status not in (404, 204):
    88              errmsg = "Failed to delete unit: {} {} - {}".format(
    89                  resp.status, resp.reason, data)
    90              raise RuntimeError(errmsg)
    91          return data
    92  
    93      def _get_state(self, name=None):
    94          headers = {'Content-Type': 'application/json'}
    95          url = '/v1-alpha/state'
    96          if name:
    97              url += '?unitName={name}.service'.format(**locals())
    98          self.conn.request('GET', url, headers=headers)
    99          resp = self.conn.getresponse()
   100          data = resp.read()
   101          if resp.status not in (200,):
   102              errmsg = "Failed to retrieve state: {} {} - {}".format(
   103                  resp.status, resp.reason, data)
   104              raise RuntimeError(errmsg)
   105          return json.loads(data)
   106  
   107      def _get_machines(self):
   108          headers = {'Content-Type': 'application/json'}
   109          url = '/v1-alpha/machines'
   110          self.conn.request('GET', url, headers=headers)
   111          resp = self.conn.getresponse()
   112          data = resp.read()
   113          if resp.status not in (200,):
   114              errmsg = "Failed to retrieve machines: {} {} - {}".format(
   115                  resp.status, resp.reason, data)
   116              raise RuntimeError(errmsg)
   117          return json.loads(data)
   118  
   119      # container api
   120  
   121      def create(self, name, image, command='', template=None, **kwargs):
   122          """Create a container"""
   123          self._create_container(name, image, command,
   124                                 template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)
   125  
   126      def _create_container(self, name, image, command, unit, **kwargs):
   127          l = locals().copy()
   128          l.update(re.match(MATCH, name).groupdict())
   129          # prepare memory limit for the container type
   130          mem = kwargs.get('memory', {}).get(l['c_type'], None)
   131          if mem:
   132              l.update({'memory': '-m {}'.format(mem.lower())})
   133          else:
   134              l.update({'memory': ''})
   135          # prepare memory limit for the container type
   136          cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
   137          if cpu:
   138              l.update({'cpu': '-c {}'.format(cpu)})
   139          else:
   140              l.update({'cpu': ''})
   141          # set unit hostname
   142          l.update({'hostname': self._get_hostname(name)})
   143          # should a special entrypoint be used
   144          entrypoint = kwargs.get('entrypoint')
   145          if entrypoint:
   146              l.update({'entrypoint': '{}'.format(entrypoint)})
   147          # encode command as utf-8
   148          if isinstance(l.get('command'), basestring):
   149              l['command'] = l['command'].encode('utf-8')
   150          # construct unit from template
   151          for f in unit:
   152              f['value'] = f['value'].format(**l)
   153          # prepare tags only if one was provided
   154          tags = kwargs.get('tags', {})
   155          if tags:
   156              tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
   157              unit.append({"section": "X-Fleet", "name": "MachineMetadata",
   158                           "value": tagset})
   159          # post unit to fleet
   160          self._put_unit(name, {"desiredState": "loaded", "options": unit})
   161  
   162      def _get_hostname(self, application_name):
   163          hostname = settings.UNIT_HOSTNAME
   164          if hostname == "default":
   165              return ''
   166          elif hostname == "application":
   167              # replace underscore with dots, since underscore is not valid in DNS hostnames
   168              dns_name = application_name.replace("_", ".")
   169              return '-h ' + dns_name
   170          elif hostname == "server":
   171              return '-h %H'
   172          else:
   173              raise RuntimeError('Unsupported hostname: ' + hostname)
   174  
   175      def start(self, name):
   176          """Start a container"""
   177          self._put_unit(name, {'desiredState': 'launched'})
   178          self._wait_for_container_running(name)
   179  
   180      def _wait_for_container_state(self, name):
   181          # wait for container to get scheduled
   182          for _ in range(30):
   183              states = self._get_state(name)
   184              if states and len(states.get('states', [])) == 1:
   185                  return states.get('states')[0]
   186              time.sleep(1)
   187          else:
   188              raise RuntimeError('container timeout while retrieving state')
   189  
   190      def _wait_for_container_running(self, name):
   191          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   192          for _ in range(1200):
   193              if self.state(name) == JobState.up:
   194                  return
   195              time.sleep(1)
   196          else:
   197              raise RuntimeError('container failed to start')
   198  
   199      def _wait_for_destroy(self, name):
   200          for _ in range(30):
   201              if not self._get_state(name):
   202                  break
   203              time.sleep(1)
   204          else:
   205              raise RuntimeError('timeout on container destroy')
   206  
   207      def stop(self, name):
   208          """Stop a container"""
   209          raise NotImplementedError
   210  
   211      def destroy(self, name):
   212          """Destroy a container"""
   213          # call all destroy functions, ignoring any errors
   214          try:
   215              self._destroy_container(name)
   216          except:
   217              pass
   218          self._wait_for_destroy(name)
   219  
   220      def _destroy_container(self, name):
   221          for attempt in range(RETRIES):
   222              try:
   223                  self._delete_unit(name)
   224                  break
   225              except:
   226                  if attempt == (RETRIES - 1):  # account for 0 indexing
   227                      raise
   228  
   229      def run(self, name, image, entrypoint, command):  # noqa
   230          """Run a one-off command"""
   231          self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
   232                                 entrypoint=entrypoint)
   233          # launch the container
   234          self._put_unit(name, {'desiredState': 'launched'})
   235          # wait for the container to get scheduled
   236          state = self._wait_for_container_state(name)
   237  
   238          try:
   239              machineID = state.get('machineID')
   240  
   241              # find the machine
   242              machines = self._get_machines()
   243              if not machines:
   244                  raise RuntimeError('no available hosts to run command')
   245  
   246              # find the machine's primaryIP
   247              primaryIP = None
   248              for m in machines.get('machines', []):
   249                  if m['id'] == machineID:
   250                      primaryIP = m['primaryIP']
   251              if not primaryIP:
   252                  raise RuntimeError('could not find host')
   253  
   254              # prepare ssh key
   255              file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
   256              pkey = paramiko.RSAKey(file_obj=file_obj)
   257  
   258              # grab output via docker logs over SSH
   259              ssh = paramiko.SSHClient()
   260              ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
   261              ssh.connect(primaryIP, username="core", pkey=pkey)
   262              # share a transport
   263              tran = ssh.get_transport()
   264  
   265              def _do_ssh(cmd):
   266                  chan = tran.open_session()
   267                  # get a pty so stdout/stderr look right
   268                  chan.get_pty()
   269                  out = chan.makefile()
   270                  chan.exec_command(cmd)
   271                  rc, output = chan.recv_exit_status(), out.read()
   272                  return rc, output
   273  
   274              # wait for container to launch
   275              # we loop indefinitely here, as we have no idea how long the docker pull will take
   276              while True:
   277                  rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
   278                  if rc == 0:
   279                      break
   280                  time.sleep(1)
   281              else:
   282                  raise RuntimeError('failed to create container')
   283  
   284              # wait for container to start
   285              for _ in range(2):
   286                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   287                  if _rc != 0:
   288                      raise RuntimeError('failed to inspect container')
   289                  _container = json.loads(_output)
   290                  started_at = _container[0]["State"]["StartedAt"]
   291                  if not started_at.startswith('0001'):
   292                      break
   293                  time.sleep(1)
   294              else:
   295                  raise RuntimeError('container failed to start')
   296  
   297              # wait for container to complete
   298              for _ in range(1200):
   299                  _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   300                  if _rc != 0:
   301                      raise RuntimeError('failed to inspect container')
   302                  _container = json.loads(_output)
   303                  finished_at = _container[0]["State"]["FinishedAt"]
   304                  if not finished_at.startswith('0001'):
   305                      break
   306                  time.sleep(1)
   307              else:
   308                  raise RuntimeError('container timed out')
   309  
   310              # gather container output
   311              _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
   312              if _rc != 0:
   313                  raise RuntimeError('could not attach to container')
   314  
   315              # determine container exit code
   316              _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   317              if _rc != 0:
   318                  raise RuntimeError('could not determine exit code')
   319              container = json.loads(_output)
   320              rc = container[0]["State"]["ExitCode"]
   321  
   322          finally:
   323              # cleanup
   324              self._destroy_container(name)
   325              self._wait_for_destroy(name)
   326  
   327          # return rc and output
   328          return rc, output
   329  
   330      def state(self, name):
   331          systemdActiveStateMap = {
   332              "active": "up",
   333              "reloading": "down",
   334              "inactive": "created",
   335              "failed": "crashed",
   336              "activating": "down",
   337              "deactivating": "down",
   338          }
   339          try:
   340              # NOTE (bacongobbler): this call to ._get_unit() also acts as a pre-emptive check to
   341              # determine if the job no longer exists (will raise a RuntimeError on 404)
   342              unit = self._get_unit(name)
   343              state = self._wait_for_container_state(name)
   344              activeState = state['systemdActiveState']
   345              # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
   346              # stop the container, which in our case will return as 'failed', even though
   347              # the container is perfectly fine.
   348              if activeState == 'failed':
   349                  if json.loads(unit)['currentState'] == 'loaded':
   350                      return JobState.created
   351              return getattr(JobState, systemdActiveStateMap[activeState])
   352          except KeyError:
   353              # failed retrieving a proper response from the fleet API
   354              return JobState.error
   355          except RuntimeError:
   356              # failed to retrieve a response from the fleet API,
   357              # which means it does not exist
   358              return JobState.destroyed
   359  
   360      def attach(self, name):
   361          """
   362          Attach to a job's stdin, stdout and stderr
   363          """
   364          raise NotImplementedError
   365  
# Module-level alias for the fleet client class under the name SchedulerClient.
SchedulerClient = FleetHTTPClient
   367  
   368  
# Default unit template used by FleetHTTPClient.create().
# Each entry is one unit option (section / name / value). The value strings
# are passed through str.format() once in _create_container(), which fills in
# {name}, {image}, {command}, {memory}, {cpu} and {hostname}. The quadruple
# braces in ExecStart collapse to double braces after that single format pass,
# leaving a template expression for `docker inspect -f`.
CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; port=$(docker inspect -f '{{{{range $k, $v := .ContainerConfig.ExposedPorts }}}}{{{{$k}}}}{{{{end}}}}' $IMAGE | cut -d/ -f1) ; docker run --name {name} {memory} {cpu} {hostname} -P -e PORT=$port $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker rm -f {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]
   381  
   382  
# Unit template used by FleetHTTPClient.run() for one-off commands.
# Each entry is one unit option (section / name / value). The value strings
# are passed through str.format() once in _create_container(), which fills in
# {name}, {image}, {entrypoint} and {command}. {entrypoint} is only available
# when an entrypoint was passed, which run() always does.
RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]