# github.com/inflatablewoman/deis@v1.0.1-0.20141111034523-a4511c46a6ce/controller/scheduler/fleet.py
"""Scheduler client that drives fleet through its v1-alpha HTTP API.

Units are created, inspected and deleted over a Unix domain socket. One-off
commands additionally use SSH to read container state and logs via docker.
"""
import base64
import copy
import cStringIO
import httplib
import json
import re
import socket
import time

import paramiko


# Splits a container name into app, version, c_type and c_num groups.
# Only the start of the name is anchored because callers use re.match semantics.
# NOTE(review): the separator in front of c_num is an unescaped '.', so it
# matches any character -- confirm whether a literal dot was intended.
MATCH = re.compile(
    r'(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z-_]+)?.(?P<c_num>[0-9]+)')
# Number of attempts made when creating or deleting a unit.
RETRIES = 3


class UHTTPConnection(httplib.HTTPConnection):
    """Subclass of Python library HTTPConnection that uses a Unix domain socket.
    """

    def __init__(self, path):
        """Remember the socket ``path``; the host name is a placeholder."""
        httplib.HTTPConnection.__init__(self, 'localhost')
        self.path = path

    def connect(self):
        """Open a stream connection to the Unix socket at ``self.path``."""
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(self.path)
        self.sock = sock


class FleetHTTPClient(object):
    """Scheduler client backed by the fleet HTTP API."""

    def __init__(self, target, auth, options, pkey):
        """Store the configuration and prepare the shared connection.

        :param target: path of the Unix socket the fleet API listens on
        :param auth: stored on the instance; not used by this module
        :param options: stored on the instance; not used by this module
        :param pkey: base64-encoded RSA private key used by :meth:`run` for SSH
        """
        self.target = target
        self.auth = auth
        self.options = options
        self.pkey = pkey
        # single global connection
        self.conn = UHTTPConnection(self.target)

    # connection helpers

    def _request(self, method, url, body=None):
        """Perform one JSON request and return ``(response, data)``.

        ``data`` is the fully read response body. If the exchange itself
        fails, the shared connection is closed before re-raising so that the
        next request reconnects instead of reusing a half-finished exchange.
        """
        headers = {'Content-Type': 'application/json'}
        try:
            self.conn.request(method, url, body=body, headers=headers)
            resp = self.conn.getresponse()
            data = resp.read()
        except Exception:
            self.conn.close()
            raise
        return resp, data

    def _retry(self, func, *args):
        """Call ``func(*args)`` up to RETRIES times, re-raising the last error."""
        for attempt in range(RETRIES):
            try:
                return func(*args)
            except Exception:
                if attempt == (RETRIES - 1):  # account for 0 indexing
                    raise

    def _put_unit(self, name, body):
        """PUT the unit ``name``; raise RuntimeError unless the status is 2xx."""
        resp, data = self._request(
            'PUT', '/v1-alpha/units/{name}.service'.format(name=name),
            json.dumps(body))
        if not 200 <= resp.status <= 299:
            errmsg = "Failed to create unit: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return data

    def _delete_unit(self, name):
        """DELETE the unit ``name``; 204 and 404 both count as success."""
        resp, data = self._request(
            'DELETE', '/v1-alpha/units/{name}.service'.format(name=name))
        if resp.status not in (404, 204):
            errmsg = "Failed to delete unit: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return data

    def _get_state(self, name=None):
        """Return the decoded state document, filtered by unit when ``name`` is set.

        Raises RuntimeError when the API does not answer with status 200.
        """
        url = '/v1-alpha/state'
        if name:
            url += '?unitName={name}.service'.format(name=name)
        resp, data = self._request('GET', url)
        if resp.status not in (200,):
            errmsg = "Failed to retrieve state: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)

    def _get_machines(self):
        """Return the decoded machine list; raise RuntimeError unless status is 200."""
        resp, data = self._request('GET', '/v1-alpha/machines')
        if resp.status not in (200,):
            errmsg = "Failed to retrieve machines: {} {} - {}".format(
                resp.status, resp.reason, data)
            raise RuntimeError(errmsg)
        return json.loads(data)

    # container api

    def create(self, name, image, command='', template=None, **kwargs):
        """Create a container

        ``template`` defaults to CONTAINER_TEMPLATE. A copy is always used so
        that neither the module default nor a caller-supplied template is
        modified while the unit is being built.
        """
        unit = copy.deepcopy(template or CONTAINER_TEMPLATE)
        self._create_container(name, image, command, unit, **kwargs)

    def _create_container(self, name, image, command, unit, **kwargs):
        """Fill in the ``unit`` template and submit it to fleet with retries.

        ``unit`` is modified in place. Recognised keyword arguments are
        ``memory`` and ``cpu`` (dicts keyed by container type), ``entrypoint``
        and ``tags`` (dict turned into an X-Fleet MachineMetadata option).

        Raises AttributeError when ``name`` does not match MATCH, and whatever
        the final submission attempt raised when every retry fails.
        """
        # must stay the first statement: the template fields are exactly the
        # arguments of this method plus the groups parsed from the name
        fields = locals().copy()
        fields.update(MATCH.match(name).groupdict())
        # prepare memory limit for the container type
        mem = kwargs.get('memory', {}).get(fields['c_type'], None)
        fields['memory'] = '-m {}'.format(mem.lower()) if mem else ''
        # prepare cpu shares for the container type
        cpu = kwargs.get('cpu', {}).get(fields['c_type'], None)
        fields['cpu'] = '-c {}'.format(cpu) if cpu else ''
        # should a special entrypoint be used
        entrypoint = kwargs.get('entrypoint')
        if entrypoint:
            fields['entrypoint'] = '{}'.format(entrypoint)
        # construct unit from template
        for option in unit:
            option['value'] = option['value'].format(**fields)
        # prepare tags only if one was provided
        tags = kwargs.get('tags', {})
        if tags:
            tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
            unit.append({"section": "X-Fleet", "name": "MachineMetadata",
                         "value": tagset})
        # post unit to fleet and retry
        self._retry(self._put_unit, name,
                    {"desiredState": "launched", "options": unit})

    def start(self, name):
        """Start a container"""
        self._wait_for_container(name)

    def _wait_for_container(self, name):
        """Poll the unit state once a second until it is running or exited.

        Raises RuntimeError after the tenth 'failed' report or when the
        polling budget is exhausted.
        """
        failures = 0
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        for _ in range(1200):
            states = self._get_state(name)
            if states and len(states.get('states', [])) == 1:
                sub_state = states['states'][0].get('systemdSubState')
                if sub_state in ('running', 'exited'):
                    break
                if sub_state == 'failed':
                    # FIXME: fleet unit state reports failed when containers are fine
                    failures += 1
                    if failures == 10:
                        raise RuntimeError('container failed to start')
            time.sleep(1)
        else:
            raise RuntimeError('container timeout on start')

    def _wait_for_destroy(self, name):
        """Poll up to 30 times, one second apart, until no state is reported."""
        for _ in range(30):
            if not self._get_state(name):
                break
            time.sleep(1)
        else:
            raise RuntimeError('timeout on container destroy')

    def stop(self, name):
        """Stop a container"""
        raise NotImplementedError

    def destroy(self, name):
        """Destroy a container"""
        # deletion is best effort: errors are ignored on purpose and the
        # wait below decides whether the unit is really gone
        try:
            self._destroy_container(name)
        except Exception:
            pass
        self._wait_for_destroy(name)

    def _destroy_container(self, name):
        """Delete the unit ``name`` from fleet, retrying on failure."""
        self._retry(self._delete_unit, name)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command

        Schedules a unit built from RUN_TEMPLATE, locates the machine it
        landed on, then uses SSH and docker on that machine to wait for the
        container, collect its logs and read its exit code. The unit is
        destroyed afterwards.

        NOTE: RUN_TEMPLATE references ``{entrypoint}``, which is only filled
        in when ``entrypoint`` is truthy.

        :returns: tuple ``(rc, output)`` -- the container's ExitCode and the
            output of ``docker logs``
        :raises RuntimeError: when scheduling, host lookup, inspection or log
            collection fails or times out
        """
        self._create_container(name, image, command, copy.deepcopy(RUN_TEMPLATE),
                               entrypoint=entrypoint)

        # wait for the container to get scheduled
        for _ in range(30):
            states = self._get_state(name)
            if states and len(states.get('states', [])) == 1:
                state = states.get('states')[0]
                break
            time.sleep(1)
        else:
            raise RuntimeError('container did not report state')
        machine_id = state.get('machineID')

        # find the machine
        machines = self._get_machines()
        if not machines:
            raise RuntimeError('no available hosts to run command')

        # find the machine's primaryIP
        primary_ip = None
        for machine in machines.get('machines', []):
            if machine['id'] == machine_id:
                primary_ip = machine['primaryIP']
        if not primary_ip:
            raise RuntimeError('could not find host')

        # prepare ssh key
        file_obj = cStringIO.StringIO(base64.b64decode(self.pkey))
        pkey = paramiko.RSAKey(file_obj=file_obj)

        inspect_cmd = 'docker inspect {name}'.format(name=name)
        logs_cmd = 'docker logs {name}'.format(name=name)

        # grab output via docker logs over SSH
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(primary_ip, username="core", pkey=pkey)
            # share a transport
            tran = ssh.get_transport()

            def _do_ssh(cmd):
                chan = tran.open_session()
                try:
                    # get a pty so stdout/stderr look right
                    chan.get_pty()
                    out = chan.makefile()
                    chan.exec_command(cmd)
                    # drain the output before asking for the exit status;
                    # waiting on the status first can hang on large output
                    output = out.read()
                    status = chan.recv_exit_status()
                finally:
                    chan.close()
                return status, output

            # wait for container to start
            for _ in range(1200):
                status, _unused = _do_ssh(inspect_cmd)
                if status == 0:
                    break
                time.sleep(1)
            else:
                raise RuntimeError('container failed to start on host')

            # wait for container to complete
            for _ in range(1200):
                status, inspected = _do_ssh(inspect_cmd)
                if status != 0:
                    raise RuntimeError('failed to inspect container')
                finished_at = json.loads(inspected)[0]["State"]["FinishedAt"]
                # a timestamp starting with year 0001 means "not finished yet"
                if not finished_at.startswith('0001'):
                    break
                time.sleep(1)
            else:
                raise RuntimeError('container timed out')

            # gather container output
            status, output = _do_ssh(logs_cmd)
            if status != 0:
                raise RuntimeError('could not attach to container')

            # determine container exit code
            status, inspected = _do_ssh(inspect_cmd)
            if status != 0:
                raise RuntimeError('could not determine exit code')
            rc = json.loads(inspected)[0]["State"]["ExitCode"]
        finally:
            # always release the SSH connection, even when a step above fails
            ssh.close()

        # cleanup
        self._destroy_container(name)
        self._wait_for_destroy(name)

        # return rc and output
        return rc, output

    def attach(self, name):
        """
        Attach to a job's stdin, stdout and stderr
        """
        raise NotImplementedError

SchedulerClient = FleetHTTPClient


CONTAINER_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name}"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; port=$(docker inspect -f '{{{{range $k, $v := .ContainerConfig.ExposedPorts }}}}{{{{$k}}}}{{{{end}}}}' $IMAGE | cut -d/ -f1) ; docker run --name {name} {memory} {cpu} -P -e PORT=$port $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "ExecStop", "value": '''/usr/bin/docker rm -f {name}'''},
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
    {"section": "Service", "name": "RestartSec", "value": "5"},
    {"section": "Service", "name": "Restart", "value": "on-failure"},
]


RUN_TEMPLATE = [
    {"section": "Unit", "name": "Description", "value": "{name} admin command"},
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"'''},  # noqa
    {"section": "Service", "name": "ExecStartPre", "value": '''/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"'''},  # noqa
    {"section": "Service", "name": "ExecStart", "value": '''/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker run --name {name} --entrypoint={entrypoint} -a stdout -a stderr $IMAGE {command}"'''},  # noqa
    {"section": "Service", "name": "TimeoutStartSec", "value": "20m"},
]