"""Scheduler client that drives CoreOS fleet over its HTTP API.

Units are submitted to fleet through a Unix domain socket.  One-off
``run`` commands additionally SSH into the scheduled host (as the
``core`` user) to wait for the container and collect its output via
``docker logs``.
"""

import base64
import copy
import cStringIO
import httplib
import json
import re
import socket
import time

import paramiko
from django.conf import settings

from . import AbstractSchedulerClient
from .states import JobState


# Parses container names such as "myapp_v2.web.1" into app / version /
# container type / container number.
# NOTE(review): the separator before c_num is an unescaped '.', so it
# matches ANY single character; kept as-is so existing unit names keep
# matching -- confirm before tightening to r'\.'.
MATCH = re.compile(
    r'(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
# How many times transient fleet API failures are retried before giving up.
RETRIES = 3


class UHTTPConnection(httplib.HTTPConnection):
    """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    """

    def __init__(self, path):
        # host is irrelevant for AF_UNIX; 'localhost' just satisfies the API
        httplib.HTTPConnection.__init__(self, 'localhost')
        self.path = path

    def connect(self):
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(self.path)
        self.sock = sock


class FleetHTTPClient(AbstractSchedulerClient):
    """Scheduler client backed by fleet's v1-alpha HTTP API."""

    def __init__(self, target, auth, options, pkey):
        super(FleetHTTPClient, self).__init__(target, auth, options, pkey)
        # single global connection
        self.conn = UHTTPConnection(self.target)

    # connection helpers

    def _request_unit(self, method, name, body=None):
        """Issue `method` against the fleet unit endpoint for `name`.

        Returns the raw httplib response; `body` is JSON-encoded (a
        `None` body is sent as the JSON literal "null", matching the
        behavior fleet has always been given).
        """
        headers = {'Content-Type': 'application/json'}
        self.conn.request(method, '/v1-alpha/units/{name}.service'.format(**locals()),
                          headers=headers, body=json.dumps(body))
        return self.conn.getresponse()

    def _get_unit(self, name):
        """Return the raw unit definition for `name`, retrying transient errors.

        Raises RuntimeError on a non-2xx response (e.g. 404 when the
        unit does not exist).
        """
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('GET', name)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to retrieve unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # only surface the error once every retry is exhausted
                if attempt >= (RETRIES - 1):
                    raise

    def _put_unit(self, name, body):
        """Create or modify the unit `name`, retrying transient errors."""
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('PUT', name, body)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to create unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                # only surface the error once every retry is exhausted
                if attempt >= (RETRIES - 1):
                    raise

    def _delete_unit(self, name):
        """Delete the unit `name`; a 404 (already gone) is treated as success."""
        headers = {'Content-Type': 'application/json'}
        self.conn.request('DELETE', '/v1-alpha/units/{name}.service'.format(**locals()),
                          headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (404, 204):
            errmsg = "Failed to delete unit: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return data

    def _get_state(self, name=None):
        """Return fleet's state document, optionally filtered to one unit."""
        headers = {'Content-Type': 'application/json'}
        url = '/v1-alpha/state'
        if name:
            url += '?unitName={name}.service'.format(**locals())
        self.conn.request('GET', url, headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (200,):
            errmsg = "Failed to retrieve state: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)

    def _get_machines(self):
        """Return the cluster machine list from fleet."""
        headers = {'Content-Type': 'application/json'}
        url = '/v1-alpha/machines'
        self.conn.request('GET', url, headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (200,):
            errmsg = "Failed to retrieve machines: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)

    # container api

    def create(self, name, image, command='', template=None, **kwargs):
        """Create a container."""
        self._create_container(name, image, command,
                               template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)

    def _create_container(self, name, image, command, unit, **kwargs):
        """Render `unit` (a template of systemd options) and submit it loaded.

        The template placeholders ({name}, {image}, {memory}, {cpu},
        {hostname}, {entrypoint}, {command}, ...) are filled from this
        function's locals plus the fields parsed out of `name` by MATCH.
        """
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        # prepare memory limit for the container type
        mem = kwargs.get('memory', {}).get(l['c_type'], None)
        if mem:
            l.update({'memory': '-m {}'.format(mem.lower())})
        else:
            l.update({'memory': ''})
        # prepare cpu limit for the container type
        cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
        if cpu:
            l.update({'cpu': '-c {}'.format(cpu)})
        else:
            l.update({'cpu': ''})
        # set unit hostname
        l.update({'hostname': self._get_hostname(name)})
        # should a special entrypoint be used
        entrypoint = kwargs.get('entrypoint')
        if entrypoint:
            l.update({'entrypoint': '{}'.format(entrypoint)})
        # encode command as utf-8 so .format() below doesn't choke on unicode
        if isinstance(l.get('command'), basestring):
            l['command'] = l['command'].encode('utf-8')
        # construct unit from template
        for f in unit:
            f['value'] = f['value'].format(**l)
        # prepare tags only if one was provided
        tags = kwargs.get('tags', {})
        tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.viewitems()])
        if settings.ENABLE_PLACEMENT_OPTIONS in ['true', 'True', 'TRUE', '1']:
            unit.append({"section": "X-Fleet", "name": "MachineMetadata",
                         "value": tagset + ' "dataPlane=true"'})
        # post unit to fleet in the "loaded" (created but not started) state
        self._put_unit(name, {"desiredState": "loaded", "options": unit})

    def _get_hostname(self, application_name):
        """Return the `docker run -h` flag derived from settings.UNIT_HOSTNAME."""
        hostname = settings.UNIT_HOSTNAME
        if hostname == "default":
            return ''
        elif hostname == "application":
            # replace underscore with dots, since underscore is not valid in DNS hostnames
            dns_name = application_name.replace("_", ".")
            return '-h ' + dns_name
        elif hostname == "server":
            # %H is expanded by systemd to the machine's hostname
            return '-h %H'
        else:
            raise RuntimeError('Unsupported hostname: ' + hostname)

    def start(self, name):
        """Start a container."""
        self._put_unit(name, {'desiredState': 'launched'})
        self._wait_for_container_running(name)

    def _wait_for_container_state(self, name):
        """Poll until fleet reports exactly one state entry for `name`.

        Returns that state dict, or raises RuntimeError after ~30s.
        """
        # wait for container to get scheduled
        for _ in xrange(30):
            states = self._get_state(name)
            if states and len(states.get('states', [])) == 1:
                return states.get('states')[0]
            time.sleep(1)
        else:
            raise RuntimeError('container timeout while retrieving state')

    def _wait_for_container_running(self, name):
        """Block until the job reports JobState.up, or fail."""
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        try:
            self._wait_for_job_state(name, JobState.up)
        except RuntimeError:
            raise RuntimeError('container failed to start')

    def _wait_for_job_state(self, name, state):
        """Poll self.state(name) once per second until it equals `state`."""
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        for _ in xrange(1200):
            if self.state(name) == state:
                return
            time.sleep(1)
        else:
            raise RuntimeError('timeout waiting for job state: {}'.format(state))

    def _wait_for_destroy(self, name):
        """Poll until fleet no longer reports any state for `name` (~30s max)."""
        for _ in xrange(30):
            if not self._get_state(name):
                break
            time.sleep(1)
        else:
            raise RuntimeError('timeout on container destroy')

    def stop(self, name):
        """Stop a container."""
        self._put_unit(name, {"desiredState": "loaded"})
        self._wait_for_job_state(name, JobState.created)

    def destroy(self, name):
        """Destroy a container."""
        # call all destroy functions, ignoring any errors
        try:
            self._destroy_container(name)
        except Exception:
            # best-effort: _wait_for_destroy below will surface real failures
            pass
        self._wait_for_destroy(name)

    def _destroy_container(self, name):
        """Delete the unit for `name`, retrying transient errors."""
        for attempt in xrange(RETRIES):
            try:
                self._delete_unit(name)
                break
            except Exception:
                if attempt == (RETRIES - 1):  # account for 0 indexing
                    raise

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command.

        Launches a RUN_TEMPLATE unit, SSHes to the machine fleet picked,
        waits for the container to start and finish, then returns
        (exit_code, logs).  The unit is destroyed in all cases.
        """
        self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
                               entrypoint=entrypoint)
        # launch the container
        self._put_unit(name, {'desiredState': 'launched'})
        # wait for the container to get scheduled
        state = self._wait_for_container_state(name)

        try:
            machineID = state.get('machineID')

            # find the machine
            machines = self._get_machines()
            if not machines:
                raise RuntimeError('no available hosts to run command')

            # find the machine's primaryIP
            primaryIP = None
            for m in machines.get('machines', []):
                if m['id'] == machineID:
                    primaryIP = m['primaryIP']
            if not primaryIP:
                raise RuntimeError('could not find host')

            # prepare ssh key
            file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
            pkey = paramiko.RSAKey(file_obj=file_obj)

            # grab output via docker logs over SSH
            ssh = paramiko.SSHClient()
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(primaryIP, username="core", pkey=pkey)
            # share a transport
            tran = ssh.get_transport()

            def _do_ssh(cmd):
                # run `cmd` on the remote host; returns (exit_code, output)
                with tran.open_session() as chan:
                    chan.exec_command(cmd)
                    while not chan.exit_status_ready():
                        time.sleep(1)
                    out = chan.makefile()
                    output = out.read()
                    rc = chan.recv_exit_status()
                    return rc, output

            # wait for container to launch
            # we loop indefinitely here, as we have no idea how long the docker pull will take
            while True:
                rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
                if rc == 0:
                    break
                time.sleep(1)

            # wait for container to start
            for _ in xrange(2):
                _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
                if _rc != 0:
                    raise RuntimeError('failed to inspect container')
                _container = json.loads(_output)
                started_at = _container[0]["State"]["StartedAt"]
                # docker reports the zero time ("0001-...") until started
                if not started_at.startswith('0001'):
                    break
                time.sleep(1)
            else:
                raise RuntimeError('container failed to start')

            # wait for container to complete
            for _ in xrange(1200):
                _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
                if _rc != 0:
                    raise RuntimeError('failed to inspect container')
                _container = json.loads(_output)
                finished_at = _container[0]["State"]["FinishedAt"]
                # docker reports the zero time ("0001-...") until finished
                if not finished_at.startswith('0001'):
                    break
                time.sleep(1)
            else:
                raise RuntimeError('container timed out')

            # gather container output
            _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
            if _rc != 0:
                raise RuntimeError('could not attach to container')

            # determine container exit code
            _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
            if _rc != 0:
                raise RuntimeError('could not determine exit code')
            container = json.loads(_output)
            rc = container[0]["State"]["ExitCode"]

        finally:
            # cleanup
            self._destroy_container(name)
            self._wait_for_destroy(name)

        # return rc and output
        return rc, output

    def state(self, name):
        """Display the given job's running state."""
        systemdActiveStateMap = {
            'active': 'up',
            'reloading': 'down',
            'inactive': 'created',
            'failed': 'crashed',
            'activating': 'down',
            'deactivating': 'down',
        }
        try:
            # NOTE (bacongobbler): this call to ._get_unit() acts as a pre-emptive check to
            # determine if the job no longer exists (will raise a RuntimeError on 404)
            self._get_unit(name)
            state = self._wait_for_container_state(name)
            activeState = state['systemdActiveState']
            # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
            # stop the container, which in our case will return as 'failed', even though
            # the container is perfectly fine.
            if activeState == 'failed' and state['systemdLoadState'] == 'loaded':
                return JobState.created
            return getattr(JobState, systemdActiveStateMap[activeState])
        except KeyError:
            # failed retrieving a proper response from the fleet API
            return JobState.error
        except RuntimeError:
            # failed to retrieve a response from the fleet API,
            # which means it does not exist
            return JobState.destroyed

SchedulerClient = FleetHTTPClient


# Long-running application container: pull from the in-cluster registry,
# remove any stale container of the same name, then run it with restart
# and resource-limit options filled in by _create_container.
CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --rm {memory} {cpu} {hostname} -P $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]


# One-off admin command container used by run(): no restart policy, output
# attached so `docker logs` can be collected over SSH afterwards.
RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]