github.com/blystad/deis@v0.11.0/controller/scheduler/coreos.py

from cStringIO import StringIO
import base64
import os
import random
import re
import subprocess
import time


ROOT_DIR = os.path.join(os.getcwd(), 'coreos')
if not os.path.exists(ROOT_DIR):
    os.mkdir(ROOT_DIR)

MATCH = re.compile(
    '(?P<app>[a-z0-9-]+)_?(?P<version>v[0-9]+)?\.?(?P<c_type>[a-z]+)?.(?P<c_num>[0-9]+)')


class FleetClient(object):

    def __init__(self, cluster_name, hosts, auth, domain, options):
        self.name = cluster_name
        self.hosts = hosts
        self.domain = domain
        self.options = options
        self.auth = auth
        self.auth_path = os.path.join(ROOT_DIR, 'ssh-{cluster_name}'.format(**locals()))
        with open(self.auth_path, 'w') as f:
            f.write(base64.b64decode(auth))
        os.chmod(self.auth_path, 0600)

        self.env = {
            'PATH': '/usr/local/bin:/usr/bin:/bin:{}'.format(
                os.path.abspath(os.path.join(__file__, '..'))),
            'FLEETW_KEY': self.auth_path,
            'FLEETW_HOST': random.choice(self.hosts.split(','))}

    # scheduler setup / teardown

    def setUp(self):
        """
        Setup a CoreOS cluster including router and log aggregator
        """
        return

    def tearDown(self):
        """
        Tear down a CoreOS cluster including router and log aggregator
        """
        return

    # announcer helpers

    def _log_skipped_announcer(self, action, name):
        """
        Logs a message stating that this operation doesn't require an announcer
        """
        print "-- skipping announcer {} for {}".format(action, name)

    # job api

    def create(self, name, image, command='', template=None, use_announcer=True, **kwargs):
        """
        Create a new job
        """
        print 'Creating {name}'.format(**locals())
        env = self.env.copy()
        self._create_container(name, image, command, template or CONTAINER_TEMPLATE, env, **kwargs)
        self._create_log(name, image, command, LOG_TEMPLATE, env)

        if use_announcer:
            self._create_announcer(name, image, command, ANNOUNCE_TEMPLATE, env)
        else:
            self._log_skipped_announcer('create', name)

    def _create_container(self, name, image, command, template, env, **kwargs):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        # prepare memory limit for the container type
        mem = kwargs.get('memory', {}).get(l['c_type'], None)
        if mem:
            l.update({'memory': '-m {}'.format(mem.lower())})
        else:
            l.update({'memory': ''})
        # prepare cpu shares for the container type
        cpu = kwargs.get('cpu', {}).get(l['c_type'], None)
        if cpu:
            l.update({'cpu': '-c {}'.format(cpu)})
        else:
            l.update({'cpu': ''})
        env.update({'FLEETW_UNIT': name + '.service'})
        # construct unit from template
        unit = template.format(**l)
        # prepare tags only if one was provided
        tags = kwargs.get('tags', {})
        if tags:
            tagset = ' '.join(['"{}={}"'.format(k, v) for k, v in tags.items()])
            unit = unit + '\n[X-Fleet]\nX-ConditionMachineMetadata={}\n'.format(tagset)
        env.update({'FLEETW_UNIT_DATA': base64.b64encode(unit)})
        return subprocess.check_call('fleetctl.sh submit {name}.service'.format(**l),
                                     shell=True, env=env)

    def _create_announcer(self, name, image, command, template, env):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        env.update({'FLEETW_UNIT': name + '-announce' + '.service'})
        env.update({'FLEETW_UNIT_DATA': base64.b64encode(template.format(**l))})
        return subprocess.check_call('fleetctl.sh submit {name}-announce.service'.format(**l),  # noqa
                                     shell=True, env=env)
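
    # Example (illustrative): MATCH splits a unit name into app/version/type/number
    # fields, which the _create_* helpers use to fill in the unit templates at the
    # bottom of this module. A hypothetical name 'myapp_v2.web.1' yields:
    #
    #   re.match(MATCH, 'myapp_v2.web.1').groupdict()
    #   # {'app': 'myapp', 'version': 'v2', 'c_type': 'web', 'c_num': '1'}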
    def _create_log(self, name, image, command, template, env):
        l = locals().copy()
        l.update(re.match(MATCH, name).groupdict())
        env.update({'FLEETW_UNIT': name + '-log' + '.service'})
        env.update({'FLEETW_UNIT_DATA': base64.b64encode(template.format(**l))})
        return subprocess.check_call('fleetctl.sh submit {name}-log.service'.format(**locals()),  # noqa
                                     shell=True, env=env)

    def start(self, name, use_announcer=True):
        """
        Start an idle job
        """
        print 'Starting {name}'.format(**locals())
        env = self.env.copy()
        self._start_container(name, env)
        self._start_log(name, env)

        if use_announcer:
            self._start_announcer(name, env)
            self._wait_for_announcer(name, env)
        else:
            self._log_skipped_announcer('start', name)

    def _start_log(self, name, env):
        subprocess.check_call(
            'fleetctl.sh start -no-block {name}-log.service'.format(**locals()),
            shell=True, env=env)

    def _start_container(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh start -no-block {name}.service'.format(**locals()),
            shell=True, env=env)

    def _start_announcer(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh start -no-block {name}-announce.service'.format(**locals()),
            shell=True, env=env)

    def _wait_for_announcer(self, name, env):
        status = None
        # we bump to 20 minutes here to match the timeout on the router and in the app unit files
        for _ in range(1200):
            status = subprocess.check_output(
                "fleetctl.sh list-units --no-legend --fields unit,sub | grep {name}-announce.service | awk '{{print $2}}'".format(**locals()),  # noqa
                shell=True, env=env).strip('\n')
            if status == 'running':
                break
            time.sleep(1)
        else:
            raise RuntimeError('Container failed to start')

    def stop(self, name, use_announcer=True):
        """
        Stop a running job
        """
        print 'Stopping {name}'.format(**locals())
        env = self.env.copy()

        if use_announcer:
            self._stop_announcer(name, env)
        else:
            self._log_skipped_announcer('stop', name)

        self._stop_container(name, env)
        self._stop_log(name, env)

    def _stop_container(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh stop -block-attempts=600 {name}.service'.format(**locals()),
            shell=True, env=env)

    def _stop_announcer(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh stop -block-attempts=600 {name}-announce.service'.format(**locals()),
            shell=True, env=env)

    def _stop_log(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh stop -block-attempts=600 {name}-log.service'.format(**locals()),
            shell=True, env=env)

    def destroy(self, name, use_announcer=True):
        """
        Destroy an existing job
        """
        print 'Destroying {name}'.format(**locals())
        env = self.env.copy()

        if use_announcer:
            self._destroy_announcer(name, env)
        else:
            self._log_skipped_announcer('destroy', name)

        self._destroy_container(name, env)
        self._destroy_log(name, env)

    def _destroy_container(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh destroy {name}.service'.format(**locals()),
            shell=True, env=env)

    def _destroy_announcer(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh destroy {name}-announce.service'.format(**locals()),
            shell=True, env=env)

    def _destroy_log(self, name, env):
        return subprocess.check_call(
            'fleetctl.sh destroy {name}-log.service'.format(**locals()),
            shell=True, env=env)
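
    # The methods above manage each job as a trio of fleet units ({name}.service,
    # {name}-announce.service, {name}-log.service). The two methods below cover
    # one-off commands and attachment: run() shells out to the fleetrun.sh wrapper,
    # waits for it to exit, and returns the exit code along with its combined
    # stdout/stderr; attach() is a placeholder returning empty in-memory streams.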
    def run(self, name, image, command):
        """
        Run a one-off command
        """
        print 'Running {name}'.format(**locals())
        output = subprocess.PIPE
        p = subprocess.Popen('fleetrun.sh {command}'.format(**locals()), shell=True, env=self.env,
                             stdout=output, stderr=subprocess.STDOUT)
        rc = p.wait()
        return rc, p.stdout.read()

    def attach(self, name):
        """
        Attach to a job's stdin, stdout and stderr
        """
        return StringIO(), StringIO(), StringIO()

SchedulerClient = FleetClient


CONTAINER_TEMPLATE = """
[Unit]
Description={name}

[Service]
ExecStartPre=/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; docker pull $IMAGE"
ExecStartPre=/bin/sh -c "docker inspect {name} >/dev/null 2>&1 && docker rm -f {name} || true"
ExecStart=/bin/sh -c "IMAGE=$(etcdctl get /deis/registry/host 2>&1):$(etcdctl get /deis/registry/port 2>&1)/{image}; port=$(docker inspect -f '{{{{range $k, $v := .ContainerConfig.ExposedPorts }}}}{{{{$k}}}}{{{{end}}}}' $IMAGE | cut -d/ -f1) ; docker run --name {name} {memory} {cpu} -P -e PORT=$port $IMAGE {command}"
ExecStop=/usr/bin/docker rm -f {name}
TimeoutStartSec=20m
"""  # noqa

# TODO revisit the "not getting a port" issue after we upgrade to Docker 1.1.0
ANNOUNCE_TEMPLATE = """
[Unit]
Description={name} announce
BindsTo={name}.service

[Service]
EnvironmentFile=/etc/environment
ExecStartPre=/bin/sh -c "until docker inspect -f '{{{{range $i, $e := .NetworkSettings.Ports }}}}{{{{$p := index $e 0}}}}{{{{$p.HostPort}}}}{{{{end}}}}' {name} >/dev/null 2>&1; do sleep 2; done; port=$(docker inspect -f '{{{{range $i, $e := .NetworkSettings.Ports }}}}{{{{$p := index $e 0}}}}{{{{$p.HostPort}}}}{{{{end}}}}' {name}); if [[ -z $port ]]; then echo We have no port...; exit 1; fi; echo Waiting for $port/tcp...; until netstat -lnt | grep :$port >/dev/null; do sleep 1; done"
ExecStart=/bin/sh -c "port=$(docker inspect -f '{{{{range $i, $e := .NetworkSettings.Ports }}}}{{{{$p := index $e 0}}}}{{{{$p.HostPort}}}}{{{{end}}}}' {name}); echo Connected to $COREOS_PRIVATE_IPV4:$port/tcp, publishing to etcd...; while netstat -lnt | grep :$port >/dev/null; do etcdctl set /deis/services/{app}/{name} $COREOS_PRIVATE_IPV4:$port --ttl 60 >/dev/null; sleep 45; done"
ExecStop=/usr/bin/etcdctl rm --recursive /deis/services/{app}/{name}
TimeoutStartSec=20m

[X-Fleet]
X-ConditionMachineOf={name}.service
"""  # noqa

LOG_TEMPLATE = """
[Unit]
Description={name} log
BindsTo={name}.service

[Service]
ExecStartPre=/bin/sh -c "until docker inspect {name} >/dev/null 2>&1; do sleep 1; done"
ExecStart=/bin/sh -c "docker logs -f {name} 2>&1 | logger -p local0.info -t {app}[{c_type}.{c_num}] --udp --server $(etcdctl get /deis/logs/host) --port $(etcdctl get /deis/logs/port)"
TimeoutStartSec=20m

[X-Fleet]
X-ConditionMachineOf={name}.service
"""  # noqa
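

# ----------------------------------------------------------------------------
# Usage sketch (illustrative only): how this scheduler client might be driven.
# It assumes a reachable CoreOS/fleet host, the fleetctl.sh/fleetrun.sh wrappers
# on PATH, and a base64-encoded SSH private key; every value below is a
# placeholder, not part of the Deis controller itself.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    with open(os.path.expanduser('~/.ssh/id_rsa')) as f:
        ssh_key_b64 = base64.b64encode(f.read())
    client = SchedulerClient(
        cluster_name='dev',
        hosts='172.17.8.100',   # comma-separated list; one host is picked at random
        auth=ssh_key_b64,       # decoded and written to coreos/ssh-dev with mode 0600
        domain='example.com',
        options={})
    name = 'myapp_v1.web.1'     # must match the MATCH pattern at the top of the file
    client.create(name, image='myapp:v1')
    client.start(name)          # blocks until the announcer unit reports 'running'
    client.stop(name)
    client.destroy(name)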