github.com/misfo/deis@v1.0.1-0.20141111224634-e0eee0392b8a/controller/scheduler/fleet.py (about)

     1  import cStringIO
     2  import base64
     3  import copy
     4  import json
     5  import httplib
     6  import paramiko
     7  import socket
     8  import re
     9  import time
    10  
    11  
    12  MATCH = re.compile(
    13      '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
    14  RETRIES = 3
    15  
    16  
    17  class UHTTPConnection(httplib.HTTPConnection):
    18      """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    19      """
    20  
    21      def __init__(self, path):
    22          httplib.HTTPConnection.__init__(self, 'localhost')
    23          self.path = path
    24  
    25      def connect(self):
    26          sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    27          sock.connect(self.path)
    28          self.sock = sock
    29  
    30  
    31  class FleetHTTPClient(object):
    32  
    33      def __init__(self, target, auth, options, pkey):
    34          self.target = target
    35          self.auth = auth
    36          self.options = options
    37          self.pkey = pkey
    38          # single global connection
    39          self.conn = UHTTPConnection(self.target)
    40  
    41      # connection helpers
    42  
    43      def _put_unit(self, name, body):
    44          headers = {'Content-Type': 'application/json'}
    45          self.conn.request('PUT', '/v1-alpha/units/{name}.service'.format(**locals()),
    46                            headers=headers, body=json.dumps(body))
    47          resp = self.conn.getresponse()
    48          data = resp.read()
    49          if not 200 <= resp.status <= 299:
    50              errmsg = "Failed to create unit: {} {} - {}".format(
    51                  resp.status, resp.reason, data)
    52              raise RuntimeError(errmsg)
    53          return data
    54  
    55      def _delete_unit(self, name):
    56          headers = {'Content-Type': 'application/json'}
    57          self.conn.request('DELETE', '/v1-alpha/units/{name}.service'.format(**locals()),
    58                            headers=headers)
    59          resp = self.conn.getresponse()
    60          data = resp.read()
    61          if resp.status not in (404, 204):
    62              errmsg = "Failed to delete unit: {} {} - {}".format(
    63                  resp.status, resp.reason, data)
    64              raise RuntimeError(errmsg)
    65          return data
    66  
    67      def _get_state(self, name=None):
    68          headers = {'Content-Type': 'application/json'}
    69          url = '/v1-alpha/state'
    70          if name:
    71              url += '?unitName={name}.service'.format(**locals())
    72          self.conn.request('GET', url, headers=headers)
    73          resp = self.conn.getresponse()
    74          data = resp.read()
    75          if resp.status not in (200,):
    76              errmsg = "Failed to retrieve state: {} {} - {}".format(
    77                  resp.status, resp.reason, data)
    78              raise RuntimeError(errmsg)
    79          return json.loads(data)
    80  
    81      def _get_machines(self):
    82          headers = {'Content-Type': 'application/json'}
    83          url = '/v1-alpha/machines'
    84          self.conn.request('GET', url, headers=headers)
    85          resp = self.conn.getresponse()
    86          data = resp.read()
    87          if resp.status not in (200,):
    88              errmsg = "Failed to retrieve machines: {} {} - {}".format(
    89                  resp.status, resp.reason, data)
    90              raise RuntimeError(errmsg)
    91          return json.loads(data)
    92  
    93      # container api
    94  
    95      def create(self, name, image, command='', template=None, **kwargs):
    96          """Create a container"""
    97          self._create_container(name, image, command,
    98                                 template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)
    99  
   100      def _create_container(self, name, image, command, unit, **kwargs):
   101          l = locals().copy()
   102          l.update(re.match(MATCH, name).groupdict())
   103          # prepare memory limit for the container type
   104          mem = kwargs.get('memory', {}).get(l['c_type'], None)
   105          if mem:
   106              l.update({'memory': '-m {}'.format(mem.lower())})
   107          else:
   108              l.update({'memory': ''})
   109          # prepare memory limit for the container type
   110          cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
   111          if cpu:
   112              l.update({'cpu': '-c {}'.format(cpu)})
   113          else:
   114              l.update({'cpu': ''})
   115          # should a special entrypoint be used
   116          entrypoint = kwargs.get('entrypoint')
   117          if entrypoint:
   118              l.update({'entrypoint': '{}'.format(entrypoint)})
   119          # construct unit from template
   120          for f in unit:
   121              f['value'] = f['value'].format(**l)
   122          # prepare tags only if one was provided
   123          tags = kwargs.get('tags', {})
   124          if tags:
   125              tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
   126              unit.append({"section": "X-Fleet", "name": "MachineMetadata",
   127                           "value": tagset})
   128          # post unit to fleet and retry
   129          for attempt in range(RETRIES):
   130              try:
   131                  self._put_unit(name, {"desiredState": "launched", "options": unit})
   132                  break
   133              except:
   134                  if attempt == (RETRIES - 1):  # account for 0 indexing
   135                      raise
   136  
   137      def start(self, name):
   138          """Start a container"""
   139          self._wait_for_container(name)
   140  
   141      def _wait_for_container(self, name):
   142          failures = 0
   143          # we bump to 20 minutes here to match the timeout on the router and in the app unit files
   144          for _ in range(1200):
   145              states = self._get_state(name)
   146              if states and len(states.get('states', [])) == 1:
   147                  state = states.get('states')[0]
   148                  subState = state.get('systemdSubState')
   149                  if subState == 'running' or subState == 'exited':
   150                      break
   151                  elif subState == 'failed':
   152                      # FIXME: fleet unit state reports failed when containers are fine
   153                      failures += 1
   154                      if failures == 10:
   155                          raise RuntimeError('container failed to start')
   156              time.sleep(1)
   157          else:
   158              raise RuntimeError('container timeout on start')
   159  
   160      def _wait_for_destroy(self, name):
   161          for _ in range(30):
   162              states = self._get_state(name)
   163              if not states:
   164                  break
   165              time.sleep(1)
   166          else:
   167              raise RuntimeError('timeout on container destroy')
   168  
   169      def stop(self, name):
   170          """Stop a container"""
   171          raise NotImplementedError
   172  
   173      def destroy(self, name):
   174          """Destroy a container"""
   175          # call all destroy functions, ignoring any errors
   176          try:
   177              self._destroy_container(name)
   178          except:
   179              pass
   180          self._wait_for_destroy(name)
   181  
   182      def _destroy_container(self, name):
   183          for attempt in range(RETRIES):
   184              try:
   185                  self._delete_unit(name)
   186                  break
   187              except:
   188                  if attempt == (RETRIES - 1):  # account for 0 indexing
   189                      raise
   190  
   191      def run(self, name, image, entrypoint, command):  # noqa
   192          """Run a one-off command"""
   193          self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
   194                                 entrypoint=entrypoint)
   195  
   196          # wait for the container to get scheduled
   197          for _ in range(30):
   198              states = self._get_state(name)
   199              if states and len(states.get('states', [])) == 1:
   200                  state = states.get('states')[0]
   201                  break
   202              time.sleep(1)
   203          else:
   204              raise RuntimeError('container did not report state')
   205          machineID = state.get('machineID')
   206  
   207          # find the machine
   208          machines = self._get_machines()
   209          if not machines:
   210              raise RuntimeError('no available hosts to run command')
   211  
   212          # find the machine's primaryIP
   213          primaryIP = None
   214          for m in machines.get('machines', []):
   215              if m['id'] == machineID:
   216                  primaryIP = m['primaryIP']
   217          if not primaryIP:
   218              raise RuntimeError('could not find host')
   219  
   220          # prepare ssh key
   221          file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
   222          pkey = paramiko.RSAKey(file_obj=file_obj)
   223  
   224          # grab output via docker logs over SSH
   225          ssh = paramiko.SSHClient()
   226          ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
   227          ssh.connect(primaryIP, username="core", pkey=pkey)
   228          # share a transport
   229          tran = ssh.get_transport()
   230  
   231          def _do_ssh(cmd):
   232              chan = tran.open_session()
   233              # get a pty so stdout/stderr look right
   234              chan.get_pty()
   235              out = chan.makefile()
   236              chan.exec_command(cmd)
   237              rc, output = chan.recv_exit_status(), out.read()
   238              return rc, output
   239  
   240          # wait for container to start
   241          for _ in range(1200):
   242              rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
   243              if rc == 0:
   244                  break
   245              time.sleep(1)
   246          else:
   247              raise RuntimeError('container failed to start on host')
   248  
   249          # wait for container to complete
   250          for _ in range(1200):
   251              _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   252              if _rc != 0:
   253                  raise RuntimeError('failed to inspect container')
   254              _container = json.loads(_output)
   255              finished_at = _container[0]["State"]["FinishedAt"]
   256              if not finished_at.startswith('0001'):
   257                  break
   258              time.sleep(1)
   259          else:
   260              raise RuntimeError('container timed out')
   261  
   262          # gather container output
   263          _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
   264          if _rc != 0:
   265              raise RuntimeError('could not attach to container')
   266  
   267          # determine container exit code
   268          _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
   269          if _rc != 0:
   270              raise RuntimeError('could not determine exit code')
   271          container = json.loads(_output)
   272          rc = container[0]["State"]["ExitCode"]
   273  
   274          # cleanup
   275          self._destroy_container(name)
   276          self._wait_for_destroy(name)
   277  
   278          # return rc and output
   279          return rc, output
   280  
   281      def attach(self, name):
   282          """
   283          Attach to a job's stdin, stdout and stderr
   284          """
   285          raise NotImplementedError
   286  
# Alias for this module's scheduler implementation.
# NOTE(review): presumably the name other code imports to obtain the
# scheduler client -- confirm against the callers.
SchedulerClient = FleetHTTPClient
   288  
   289  
   290  CONTAINER_TEMPLATE = [
   291      {"section": "Unit", "name": "Description", "value": "{name}"},
   292      {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
   293      {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
   294      {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; port=$(docker inspect -f '{{{{range $k, $v := .ContainerConfig.ExposedPorts }}}}{{{{$k}}}}{{{{end}}}}' $IMAGE | cut -d/ -f1) ; docker run --name {name} {memory} {cpu} -P -e PORT=$port $IMAGE {command}"'''},  # noqa
   295      {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker rm -f {name}'''},
   296      {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
   297      {"section": "Service", "name": "RestartSec", "value": "5"},
   298      {"section": "Service", "name": "Restart", "value": "on-failure"},
   299  ]
   300  
   301  
   302  RUN_TEMPLATE = [
   303      {"section": "Unit", "name": "Description", "value": "{name} admin command"},
   304      {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
   305      {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
   306      {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
   307      {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
   308  ]