# Source: github.com/rvaralda/deis@v1.4.1/controller/scheduler/fleet.py
"""Scheduler client that manages containers as fleet units.

Units are created, launched, inspected and deleted through fleet's
``/v1-alpha`` HTTP API, reached over a Unix domain socket.  One-off commands
(``run``) additionally SSH into the machine hosting the unit in order to
collect the container's output and exit code via the docker CLI.
"""
import cStringIO
import base64
import copy
import httplib
import json
import paramiko
import socket
import re
import time

from django.conf import settings

from .states import JobState


# Splits a unit name into app / version / container type / container number.
MATCH = re.compile(
    r'(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
# Number of attempts made for unit requests before the error is propagated.
RETRIES = 3


class UHTTPConnection(httplib.HTTPConnection):
    """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    """

    def __init__(self, path):
        # the host is only a placeholder; connect() below dials ``path`` instead
        httplib.HTTPConnection.__init__(self, 'localhost')
        self.path = path

    def connect(self):
        """Open a stream connection to the Unix socket at ``self.path``."""
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(self.path)
        self.sock = sock


class FleetHTTPClient(object):
    """Scheduler client that talks to the fleet HTTP API.

    :param target: filesystem path of the Unix socket the API listens on
    :param auth: stored on the instance; not used by this class
    :param options: stored on the instance; not used by this class
    :param pkey: base64-encoded RSA private key used by ``run`` for SSH
    """

    def __init__(self, target, auth, options, pkey):
        self.target = target
        self.auth = auth
        self.options = options
        self.pkey = pkey
        # single global connection
        self.conn = UHTTPConnection(self.target)

    # connection helpers

    def _reset_connection(self):
        """Close the shared connection so that the next request reconnects."""
        try:
            self.conn.close()
        except Exception:
            # closing is best-effort: the retry that follows will surface any
            # real connectivity problem
            pass

    def _with_retries(self, func, *args):
        """Call ``func(*args)`` up to RETRIES times and return its result.

        The exception of the final attempt is re-raised unchanged.  Between
        attempts the shared connection is reset, because a request that failed
        half-way can leave the connection unable to accept a new request.
        """
        for attempt in range(RETRIES):
            try:
                return func(*args)
            except Exception:
                if attempt >= (RETRIES - 1):
                    raise
                self._reset_connection()

    def _request_unit(self, method, name, body=None):
        """Send ``method`` for the unit ``name`` and return the raw response.

        ``body``, when given, is serialized as JSON; no body is sent otherwise.
        """
        headers = {'Content-Type': 'application/json'}
        payload = json.dumps(body) if body is not None else None
        self.conn.request(method, '/v1-alpha/units/{name}.service'.format(name=name),
                          headers=headers, body=payload)
        return self.conn.getresponse()

    def _checked_unit_request(self, method, name, body, action):
        """Perform one unit request and return the response body.

        Raises RuntimeError ("Failed to <action> unit: ...") on a non-2xx status.
        """
        resp = self._request_unit(method, name, body)
        data = resp.read()
        if not 200 <= resp.status <= 299:
            errmsg = "Failed to {} unit: {} {} - {}".format(
                action, resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return data

    def _get_unit(self, name):
        """Return the raw (undecoded) response body describing unit ``name``.

        Raises RuntimeError if every attempt ends with a non-2xx status.
        """
        return self._with_retries(self._checked_unit_request, 'GET', name, None, 'retrieve')

    def _put_unit(self, name, body):
        """Create or update unit ``name`` with ``body``; return the response body.

        Raises RuntimeError if every attempt ends with a non-2xx status.
        """
        return self._with_retries(self._checked_unit_request, 'PUT', name, body, 'create')

    def _delete_unit(self, name):
        """Delete unit ``name``; a 404 response is treated as success.

        Raises RuntimeError for any status other than 204 or 404.
        """
        headers = {'Content-Type': 'application/json'}
        self.conn.request('DELETE', '/v1-alpha/units/{name}.service'.format(name=name),
                          headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (404, 204):
            errmsg = "Failed to delete unit: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return data

    def _get_json(self, url, what):
        """GET ``url`` and return the decoded JSON body.

        Raises RuntimeError ("Failed to retrieve <what>: ...") unless the
        status is 200.
        """
        headers = {'Content-Type': 'application/json'}
        self.conn.request('GET', url, headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (200,):
            errmsg = "Failed to retrieve {}: {} {} - {}".format(
                what, resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)

    def _get_state(self, name=None):
        """Return the decoded state listing, filtered to unit ``name`` if given."""
        url = '/v1-alpha/state'
        if name:
            url += '?unitName={name}.service'.format(name=name)
        return self._get_json(url, 'state')

    def _get_machines(self):
        """Return the decoded machine listing."""
        return self._get_json('/v1-alpha/machines', 'machines')

    # container api

    def create(self, name, image, command='', template=None, **kwargs):
        """Create a container"""
        self._create_container(name, image, command,
                               template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)

    def _create_container(self, name, image, command, unit, **kwargs):
        """Render the ``unit`` template for ``name`` and submit it as "loaded".

        ``unit`` is a list of option dicts whose ``value`` strings are
        ``str.format`` templates; they are rendered in place.  Recognized
        keyword arguments: ``memory`` and ``cpu`` (dicts keyed by container
        type), ``entrypoint`` and ``tags`` (dict of machine metadata).
        """
        # Must stay the first statement: the snapshot exposes exactly the
        # call arguments (name, image, command, ...) to the templates.
        params = locals().copy()
        params.update(MATCH.match(name).groupdict())
        # prepare memory limit for the container type
        mem = kwargs.get('memory', {}).get(params['c_type'], None)
        params['memory'] = '-m {}'.format(mem.lower()) if mem else ''
        # prepare cpu shares for the container type
        cpu = kwargs.get('cpu', {}).get(params['c_type'], None)
        params['cpu'] = '-c {}'.format(cpu) if cpu else ''
        # set unit hostname
        params['hostname'] = self._get_hostname(name)
        # should a special entrypoint be used
        entrypoint = kwargs.get('entrypoint')
        if entrypoint:
            params['entrypoint'] = '{}'.format(entrypoint)
        # Encode a unicode command as utf-8.  Byte strings are left untouched:
        # calling .encode() on them implies an ASCII decode first, which fails
        # for any non-ASCII byte.
        if isinstance(params.get('command'), unicode):  # noqa
            params['command'] = params['command'].encode('utf-8')
        # construct unit from template
        for option in unit:
            option['value'] = option['value'].format(**params)
        # prepare tags only if one was provided
        tags = kwargs.get('tags', {})
        if tags:
            tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
            unit.append({"section": "X-Fleet", "name": "MachineMetadata",
                         "value": tagset})
        # post unit to fleet
        self._put_unit(name, {"desiredState": "loaded", "options": unit})

    def _get_hostname(self, application_name):
        """Return the docker hostname flag selected by settings.UNIT_HOSTNAME.

        Raises RuntimeError for a setting other than "default", "application"
        or "server".
        """
        hostname = settings.UNIT_HOSTNAME
        if hostname == "default":
            return ''
        elif hostname == "application":
            # replace underscore with dots, since underscore is not valid in DNS hostnames
            dns_name = application_name.replace("_", ".")
            return '-h ' + dns_name
        elif hostname == "server":
            return '-h %H'
        else:
            raise RuntimeError('Unsupported hostname: ' + hostname)

    def start(self, name):
        """Start a container"""
        self._put_unit(name, {'desiredState': 'launched'})
        self._wait_for_container_running(name)

    def _wait_for_container_state(self, name):
        """Poll (up to 30 times, 1s apart) until exactly one state is reported.

        Returns that state dict; raises RuntimeError on timeout.
        """
        # wait for container to get scheduled
        for _ in range(30):
            states = self._get_state(name)
            if states and len(states.get('states', [])) == 1:
                return states.get('states')[0]
            time.sleep(1)
        raise RuntimeError('container timeout while retrieving state')

    def _wait_for_container_running(self, name):
        """Poll until ``state(name)`` is JobState.up; raise RuntimeError otherwise."""
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        for _ in range(1200):
            if self.state(name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError('container failed to start')

    def _wait_for_destroy(self, name):
        """Poll (up to 30 times, 1s apart) until no state is reported for ``name``.

        Raises RuntimeError on timeout.
        """
        for _ in range(30):
            if not self._get_state(name):
                return
            time.sleep(1)
        raise RuntimeError('timeout on container destroy')

    def stop(self, name):
        """Stop a container"""
        raise NotImplementedError

    def destroy(self, name):
        """Destroy a container"""
        # Deletion is best-effort: failures are deliberately ignored, and the
        # wait below reports a timeout if the unit never goes away.
        try:
            self._destroy_container(name)
        except Exception:
            pass
        self._wait_for_destroy(name)

    def _destroy_container(self, name):
        """Delete unit ``name``, retrying up to RETRIES times."""
        self._with_retries(self._delete_unit, name)

    def _find_primary_ip(self, machine_id):
        """Return the primaryIP of the machine whose id is ``machine_id``.

        Raises RuntimeError if no machines are reported or none matches.
        """
        machines = self._get_machines()
        if not machines:
            raise RuntimeError('no available hosts to run command')
        for machine in machines.get('machines', []):
            if machine['id'] == machine_id and machine['primaryIP']:
                return machine['primaryIP']
        raise RuntimeError('could not find host')

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command

        Creates and launches a unit from RUN_TEMPLATE, waits for the container
        to finish, and returns ``(exit_code, output)``.  The unit is destroyed
        afterwards whether or not the command could be observed successfully.
        Raises RuntimeError when any step fails or times out.
        """
        self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
                               entrypoint=entrypoint)
        # launch the container
        self._put_unit(name, {'desiredState': 'launched'})
        # wait for the container to get scheduled
        state = self._wait_for_container_state(name)

        ssh = None
        try:
            # find the machine's primaryIP
            primary_ip = self._find_primary_ip(state.get('machineID'))

            # prepare ssh key
            file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
            pkey = paramiko.RSAKey(file_obj=file_obj)

            # grab output via docker logs over SSH
            ssh = paramiko.SSHClient()
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(primary_ip, username="core", pkey=pkey)
            # share a transport
            tran = ssh.get_transport()

            def _do_ssh(cmd):
                """Run ``cmd`` on the host and return ``(exit_status, output)``."""
                chan = tran.open_session()
                try:
                    # get a pty so stdout/stderr look right
                    chan.get_pty()
                    out = chan.makefile()
                    chan.exec_command(cmd)
                    # Drain the output before asking for the exit status:
                    # waiting for the status first can hang when the command
                    # produces a lot of output.
                    output = out.read()
                    return chan.recv_exit_status(), output
                finally:
                    chan.close()

            inspect_cmd = 'docker inspect {name}'.format(name=name)

            def _wait_for_timestamp(field, attempts, errmsg):
                """Poll docker inspect until State[field] is set.

                A value starting with '0001' is treated as "not set yet"
                (presumably docker's zero timestamp).
                """
                for _ in range(attempts):
                    _rc, _output = _do_ssh(inspect_cmd)
                    if _rc != 0:
                        raise RuntimeError('failed to inspect container')
                    timestamp = json.loads(_output)[0]["State"][field]
                    if not timestamp.startswith('0001'):
                        return
                    time.sleep(1)
                raise RuntimeError(errmsg)

            # wait for container to launch
            # we loop indefinitely here, as we have no idea how long the docker pull will take
            while True:
                rc, _ = _do_ssh(inspect_cmd)
                if rc == 0:
                    break
                time.sleep(1)

            # wait for container to start
            _wait_for_timestamp("StartedAt", 2, 'container failed to start')

            # wait for container to complete
            _wait_for_timestamp("FinishedAt", 1200, 'container timed out')

            # gather container output
            _rc, output = _do_ssh('docker logs {name}'.format(name=name))
            if _rc != 0:
                raise RuntimeError('could not attach to container')

            # determine container exit code
            _rc, _output = _do_ssh(inspect_cmd)
            if _rc != 0:
                raise RuntimeError('could not determine exit code')
            container = json.loads(_output)
            rc = container[0]["State"]["ExitCode"]

        finally:
            # cleanup: release the SSH session first, then always remove the unit
            try:
                if ssh is not None:
                    ssh.close()
            finally:
                self._destroy_container(name)
                self._wait_for_destroy(name)

        # return rc and output
        return rc, output

    def state(self, name):
        """Return the JobState of unit ``name``.

        Returns JobState.error when the API response lacks an expected key (or
        reports an unmapped systemd state) and JobState.destroyed when the unit
        or its state cannot be retrieved.
        """
        systemdActiveStateMap = {
            "active": "up",
            "reloading": "down",
            "inactive": "created",
            "failed": "crashed",
            "activating": "down",
            "deactivating": "down",
        }
        try:
            # NOTE (bacongobbler): this call to ._get_unit() also acts as a pre-emptive check to
            # determine if the job no longer exists (will raise a RuntimeError on 404)
            unit = self._get_unit(name)
            state = self._wait_for_container_state(name)
            activeState = state['systemdActiveState']
            # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically start and
            # stop the container, which in our case will return as 'failed', even though
            # the container is perfectly fine.
            if activeState == 'failed':
                if json.loads(unit)['currentState'] == 'loaded':
                    return JobState.created
            return getattr(JobState, systemdActiveStateMap[activeState])
        except KeyError:
            # failed retrieving a proper response from the fleet API
            return JobState.error
        except RuntimeError:
            # failed to retrieve a response from the fleet API,
            # which means it does not exist
            return JobState.destroyed

    def attach(self, name):
        """
        Attach to a job's stdin, stdout and stderr
        """
        raise NotImplementedError

SchedulerClient = FleetHTTPClient


# Unit options for a long-running application container.  Every "value" is a
# str.format template rendered by FleetHTTPClient._create_container.
CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; port=$(docker inspect -f '{{{{range $k, $v := .ContainerConfig.ExposedPorts }}}}{{{{$k}}}}{{{{end}}}}' $IMAGE | cut -d/ -f1) ; docker run --name {name} {memory} {cpu} {hostname} -P -e PORT=$port $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker rm -f {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]


# Unit options for a one-off admin command started by FleetHTTPClient.run.
RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]