# github.com/chasestarr/deis@v1.13.5-0.20170519182049-1d9e59fbdbfc/controller/scheduler/fleet.py

import base64
import copy
import cStringIO
import httplib
import json
import paramiko
import re
import socket
import time

from django.conf import settings

from . import AbstractSchedulerClient
from .states import JobState


MATCH = re.compile(
    r'(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?\.(?P<c_num>[0-9]+)')
RETRIES = 3


class UHTTPConnection(httplib.HTTPConnection):
    """Subclass of Python library HTTPConnection that uses a Unix domain socket."""

    def __init__(self, path):
        httplib.HTTPConnection.__init__(self, 'localhost')
        self.path = path

    def connect(self):
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(self.path)
        self.sock = sock


class FleetHTTPClient(AbstractSchedulerClient):

    def __init__(self, target, auth, options, pkey):
        super(FleetHTTPClient, self).__init__(target, auth, options, pkey)
        # single global connection
        self.conn = UHTTPConnection(self.target)

    # connection helpers

    def _request_unit(self, method, name, body=None):
        headers = {'Content-Type': 'application/json'}
        self.conn.request(method, '/v1-alpha/units/{name}.service'.format(**locals()),
                          headers=headers, body=json.dumps(body))
        return self.conn.getresponse()

    def _get_unit(self, name):
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('GET', name)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to retrieve unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                if attempt >= (RETRIES - 1):
                    raise

    def _put_unit(self, name, body):
        for attempt in xrange(RETRIES):
            try:
                resp = self._request_unit('PUT', name, body)
                data = resp.read()
                if not 200 <= resp.status <= 299:
                    errmsg = "Failed to create unit: {} {} - {}".format(
                        resp.status, resp.reason, data)
                    raise RuntimeError(errmsg)
                return data
            except Exception:
                if attempt >= (RETRIES - 1):
                    raise

    def _delete_unit(self, name):
        headers = {'Content-Type': 'application/json'}
        self.conn.request('DELETE', '/v1-alpha/units/{name}.service'.format(**locals()),
                          headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (404, 204):
            errmsg = "Failed to delete unit: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return data

    def _get_state(self, name=None):
        headers = {'Content-Type': 'application/json'}
        url = '/v1-alpha/state'
        if name:
            url += '?unitName={name}.service'.format(**locals())
        self.conn.request('GET', url, headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (200,):
            errmsg = "Failed to retrieve state: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)

    def _get_machines(self):
        headers = {'Content-Type': 'application/json'}
        url = '/v1-alpha/machines'
        self.conn.request('GET', url, headers=headers)
        resp = self.conn.getresponse()
        data = resp.read()
        if resp.status not in (200,):
            errmsg = "Failed to retrieve machines: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)
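
    # Illustrative payload (not part of the original file; the unit name and ExecStart
    # value are assumptions): the fleet /v1-alpha/units endpoint used above takes a JSON
    # body whose "desiredState" this client drives through "loaded" and "launched", and
    # whose "options" list mirrors systemd unit-file sections, e.g.
    #
    #   self._put_unit('myapp_v2.web.1', {
    #       "desiredState": "loaded",
    #       "options": [
    #           {"section": "Service", "name": "ExecStart", "value": "/bin/true"},
    #       ],
    #   })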

    # container api

    def create(self, name, image, command='', template=None, **kwargs):
        """Create a container."""
        self._create_container(name, image, command,
                               template or copy.deepcopy(CONTAINER_TEMPLATE), **kwargs)

    def _create_container(self, name, image, command, unit, **kwargs):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        # prepare memory limit for the container type
        mem = kwargs.get('memory', {}).get(l['c_type'], None)
        if mem:
            l.update({'memory': '-m {} {}'.format(mem.lower(), settings.DISABLE_SWAP)})
        else:
            l.update({'memory': ''})
        # prepare cpu limit for the container type
        cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
        if cpu:
            l.update({'cpu': '-c {}'.format(cpu)})
        else:
            l.update({'cpu': ''})
        # set unit hostname
        l.update({'hostname': self._get_hostname(name)})
        # should a special entrypoint be used?
        entrypoint = kwargs.get('entrypoint')
        if entrypoint:
            l.update({'entrypoint': '{}'.format(entrypoint)})
        # encode command as utf-8
        if isinstance(l.get('command'), basestring):
            l['command'] = l['command'].encode('utf-8')
        # construct unit from template
        for f in unit:
            f['value'] = f['value'].format(**l)
        # prepare tags only if one was provided
        # (unit_tags is a dict view, so the dataPlane tag added below is reflected in it)
        tags = kwargs.get('tags', {})
        unit_tags = tags.viewitems()
        if settings.ENABLE_PLACEMENT_OPTIONS in ['true', 'True', 'TRUE', '1']:
            tags['dataPlane'] = 'true'
        if unit_tags:
            tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in unit_tags])
            unit.append({"section": "X-Fleet", "name": "MachineMetadata",
                         "value": tagset})
        # post unit to fleet
        self._put_unit(name, {"desiredState": "loaded", "options": unit})

    def _get_hostname(self, application_name):
        hostname = settings.UNIT_HOSTNAME
        if hostname == "default":
            return ''
        elif hostname == "application":
            # replace underscores with dots, since underscores are not valid in DNS hostnames
            dns_name = application_name.replace("_", ".")
            return '-h ' + dns_name
        elif hostname == "server":
            return '-h %H'
        else:
            raise RuntimeError('Unsupported hostname: ' + hostname)

    def start(self, name):
        """Start a container."""
        self._put_unit(name, {'desiredState': 'launched'})
        self._wait_for_container_running(name)

    def _wait_for_container_state(self, name):
        # wait for container to get scheduled
        for _ in xrange(30):
            states = self._get_state(name)
            if states and len(states.get('states', [])) == 1:
                return states.get('states')[0]
            time.sleep(1)
        else:
            raise RuntimeError('container timeout while retrieving state')

    def _wait_for_container_running(self, name):
        try:
            self._wait_for_job_state(name, JobState.up)
        except RuntimeError:
            raise RuntimeError('container failed to start')

    def _wait_for_job_state(self, name, state):
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        for _ in xrange(1200):
            if self.state(name) == state:
                return
            time.sleep(1)
        else:
            raise RuntimeError('timeout waiting for job state: {}'.format(state))

    def _wait_for_destroy(self, name):
        for _ in xrange(30):
            if not self._get_state(name):
                break
            time.sleep(1)
        else:
            raise RuntimeError('timeout on container destroy')

    def stop(self, name):
        """Stop a container."""
        self._put_unit(name, {"desiredState": "loaded"})
        self._wait_for_job_state(name, JobState.created)
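
    # Illustrative lifecycle (not part of the original file; the client variable, unit
    # name and image are assumptions): a caller drives a unit through fleet's desired
    # states roughly as
    #
    #   client.create('myapp_v2.web.1', 'myapp')  # submit the unit with desiredState=loaded
    #   client.start('myapp_v2.web.1')            # desiredState=launched, waits for JobState.up
    #   client.stop('myapp_v2.web.1')             # back to loaded, waits for JobState.created
    #   client.destroy('myapp_v2.web.1')          # delete the unit and wait for removal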

    def destroy(self, name):
        """Destroy a container."""
        # call all destroy functions, ignoring any errors
        try:
            self._destroy_container(name)
        except Exception:
            pass
        self._wait_for_destroy(name)

    def _destroy_container(self, name):
        for attempt in xrange(RETRIES):
            try:
                self._delete_unit(name)
                break
            except Exception:
                if attempt == (RETRIES - 1):  # account for 0 indexing
                    raise
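
    # Illustrative one-off run (not part of the original file; the unit name, image,
    # entrypoint and command are assumptions): run() below schedules a unit built from
    # RUN_TEMPLATE, then SSHes to the host that ran it and collects the exit code and
    # logs via `docker inspect` and `docker logs`, e.g.
    #
    #   rc, output = client.run('myapp_v2.run.1', 'myapp',
    #                           entrypoint='/bin/bash', command='-c "ls -la"')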

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command."""
        self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
                               entrypoint=entrypoint)
        # launch the container
        self._put_unit(name, {'desiredState': 'launched'})
        # wait for the container to get scheduled
        state = self._wait_for_container_state(name)

        try:
            machineID = state.get('machineID')

            # find the machine
            machines = self._get_machines()
            if not machines:
                raise RuntimeError('no available hosts to run command')

            # find the machine's primaryIP
            primaryIP = None
            for m in machines.get('machines', []):
                if m['id'] == machineID:
                    primaryIP = m['primaryIP']
            if not primaryIP:
                raise RuntimeError('could not find host')

            # prepare ssh key
            file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
            pkey = paramiko.RSAKey(file_obj=file_obj)

            # grab output via docker logs over SSH
            ssh = paramiko.SSHClient()
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(primaryIP, username="core", pkey=pkey)
            # share a transport
            tran = ssh.get_transport()

            def _do_ssh(cmd):
                with tran.open_session() as chan:
                    chan.exec_command(cmd)
                    while not chan.exit_status_ready():
                        time.sleep(1)
                    out = chan.makefile()
                    output = out.read()
                    rc = chan.recv_exit_status()
                    return rc, output

            # wait for container to launch
            # we loop indefinitely here, as we have no idea how long the docker pull will take
            while True:
                rc, _ = _do_ssh('docker inspect {name}'.format(**locals()))
                if rc == 0:
                    break
                time.sleep(1)
            else:
                raise RuntimeError('failed to create container')

            # wait for container to start
            for _ in xrange(2):
                _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
                if _rc != 0:
                    raise RuntimeError('failed to inspect container')
                _container = json.loads(_output)
                started_at = _container[0]["State"]["StartedAt"]
                if not started_at.startswith('0001'):
                    break
                time.sleep(1)
            else:
                raise RuntimeError('container failed to start')

            # wait for container to complete
            for _ in xrange(1200):
                _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
                if _rc != 0:
                    raise RuntimeError('failed to inspect container')
                _container = json.loads(_output)
                finished_at = _container[0]["State"]["FinishedAt"]
                if not finished_at.startswith('0001'):
                    break
                time.sleep(1)
            else:
                raise RuntimeError('container timed out')

            # gather container output
            _rc, output = _do_ssh('docker logs {name}'.format(**locals()))
            if _rc != 0:
                raise RuntimeError('could not attach to container')

            # determine container exit code
            _rc, _output = _do_ssh('docker inspect {name}'.format(**locals()))
            if _rc != 0:
                raise RuntimeError('could not determine exit code')
            container = json.loads(_output)
            rc = container[0]["State"]["ExitCode"]

        finally:
            # cleanup
            self._destroy_container(name)
            self._wait_for_destroy(name)

        # return rc and output
        return rc, output

    def state(self, name):
        """Display the given job's running state."""
        systemdActiveStateMap = {
            'active': 'up',
            'reloading': 'down',
            'inactive': 'created',
            'failed': 'crashed',
            'activating': 'down',
            'deactivating': 'down',
        }
        try:
            # NOTE (bacongobbler): this call to ._get_unit() acts as a pre-emptive check to
            # determine if the job no longer exists (will raise a RuntimeError on 404)
            self._get_unit(name)
            state = self._wait_for_container_state(name)
            activeState = state['systemdActiveState']
            # FIXME (bacongobbler): when fleet loads a job, sometimes it'll automatically
            # start and stop the container, which in our case will return as 'failed',
            # even though the container is perfectly fine.
            if activeState == 'failed' and state['systemdLoadState'] == 'loaded':
                return JobState.created
            return getattr(JobState, systemdActiveStateMap[activeState])
        except KeyError:
            # failed retrieving a proper response from the fleet API
            return JobState.error
        except RuntimeError:
            # failed to retrieve a response from the fleet API,
            # which means it does not exist
            return JobState.destroyed


SchedulerClient = FleetHTTPClient


CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --rm {memory} {cpu} {hostname} -P $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker stop {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "TimeoutStopSec", "value": "10"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]


RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]
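

# Minimal usage sketch (illustrative only, not part of the original module). The fleet
# socket path, the empty auth/options/pkey arguments and the unit name are assumptions;
# in Deis the controller constructs this client from Django settings.
if __name__ == '__main__':
    client = FleetHTTPClient('/var/run/fleet.sock', auth=None, options={}, pkey='')
    client.create('myapp_v2.web.1', 'myapp')  # submits the unit with desiredState=loaded
    client.start('myapp_v2.web.1')            # flips desiredState to launched, waits for up
    print(client.state('myapp_v2.web.1'))     # expected to report JobState.up
    client.destroy('myapp_v2.web.1')          # deletes the unit and waits for removal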