#!/usr/bin/env python
# Origin: github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/
#     acceptancetests/assess_container_networking.py
"""Acceptance test for networking between Juju-managed containers.

Creates containers on two host machines, then checks that they can reach
their default route, share a subnet with their host and can exchange
traffic with each other, both before and after rebooting the machines.
"""
from __future__ import print_function
from argparse import ArgumentParser
import contextlib
from copy import (
    copy,
    deepcopy,
    )
import logging
import re
import os
import subprocess
import sys
import tempfile
from textwrap import dedent
import time

from deploy_stack import (
    BootstrapManager,
    get_random_string,
    update_env,
    )
from jujupy import (
    KVM_MACHINE,
    LXC_MACHINE,
    LXD_MACHINE,
    )
from utility import (
    JujuAssertionError,
    add_basic_testing_arguments,
    configure_logging,
    wait_for_port,
    )


__metaclass__ = type


log = logging.getLogger("assess_container_networking")

# Minutes per unit for the units matched when parsing `uptime -p` output.
_UPTIME_UNIT_MINUTES = {
    'minute': 1,
    'hour': 60,
    'day': 60 * 24,
    'week': 60 * 24 * 7,
    'year': 60 * 24 * 365,
    'decade': 60 * 24 * 365 * 10,
}


def parse_args(argv=None):
    """Parse all arguments.

    :param argv: argument list to parse; None lets argparse use sys.argv
    :return: the parsed argument namespace
    """

    description = dedent("""\
    Test container address allocation.
    For LXC and KVM, create machines of each type and test the network
    between LXC <--> LXC, KVM <--> KVM and LXC <--> KVM. Also test machine
    to outside world, DNS and that these tests still pass after a reboot. In
    case of failure pull logs and configuration files from the machine that
    we detected a problem on for later analysis.
    """)
    parser = add_basic_testing_arguments(
        ArgumentParser(description=description),
        existing=False)
    parser.add_argument(
        '--machine-type',
        help='Which virtual machine/container type to test. Defaults to all.',
        choices=[KVM_MACHINE, LXC_MACHINE, LXD_MACHINE])
    parser.add_argument(
        '--space-constraint',
        help='The network space to constrain containers to. '
             'Default is no space constraints.',
        default=None,
        dest='space')
    args = parser.parse_args(argv)
    return args


def ssh(client, machine, cmd):
    """Convenience function: run a juju ssh command and get back the output

    The command is retried with an exponential back-off when the connection
    is closed by the remote host during the ssh identification exchange.

    :param client: A Juju client
    :param machine: ID of the machine on which to run a command
    :param cmd: the command to run
    :return: text output of the command
    :raises subprocess.CalledProcessError: if the command fails for any other
        reason, or still fails on the last attempt
    """
    back_off = 2
    attempts = 4
    for attempt in range(attempts):
        try:
            return client.get_juju_output('ssh', '--proxy', machine, cmd)
        except subprocess.CalledProcessError as e:
            # The exception is not guaranteed to carry stderr; a missing or
            # empty value must not mask the original failure.
            stderr = getattr(e, 'stderr', None) or ''
            if isinstance(stderr, bytes) and not isinstance(stderr, str):
                stderr = stderr.decode('utf-8', 'replace')
            # If the connection to the host failed, try again in a couple of
            # seconds. This is usually due to heavy load.
            connection_closed = re.search(
                'ssh_exchange_identification: '
                'Connection closed by remote host', stderr)
            if attempt < attempts - 1 and connection_closed:
                time.sleep(back_off)
                back_off *= 2
            else:
                raise


def make_machines(client, container_types, space):
    """Make a test environment consisting of:
       Two host machines.
       Two of each container_type on one host machine.
       One of each container_type on one host machine.

    :param client: A ModelClient
    :param container_types: list of containers to create
    :param space: network space to constrain new containers to, or a false
        value for no constraint
    :return: hosts (list), containers {container_type}{host}[containers]
    """
    # Find existing host machines
    old_hosts = client.get_status().status['machines']
    machines_to_add = 2 - len(old_hosts)

    # Allocate more hosts as needed
    if machines_to_add > 0:
        client.juju('add-machine', ('-n', str(machines_to_add)))
    status = client.wait_for_started()
    hosts = sorted(status.status['machines'].keys())[:2]

    # Work out which containers are still missing: one of each type per
    # host, plus a second one of each type on the first host.
    required = dict((host, copy(container_types)) for host in hosts)
    required[hosts[0]] += container_types
    for machine in status.iter_machines(containers=True, machines=False):
        host, container_type, _ = machine[0].split('/')
        # Containers on hosts other than the two selected are ignored.
        outstanding = required.get(host, [])
        if container_type in outstanding:
            outstanding.remove(container_type)

    # Start any new containers we need
    sargs = []
    if space:
        sargs = ['--constraints', 'spaces=' + space]

    for host, missing_types in required.items():
        for container_type in missing_types:
            placement = '{}:{}'.format(container_type, host)
            client.juju('add-machine', tuple([placement] + sargs))

    status = client.wait_for_started()

    # Build a list of containers, now they have all started
    tmp = dict(zip(hosts, [[] for h in hosts]))
    containers = dict(zip(container_types,
                          [deepcopy(tmp) for t in container_types]))
    for machine in status.iter_machines(containers=True, machines=False):
        host, container_type, _ = machine[0].split('/')
        if container_type in containers and host in containers[container_type]:
            containers[container_type][host].append(machine[0])
    return hosts, containers


def find_network(client, machine, addr):
    """Find a connected subnet containing the given address.

    When using this to find the subnet of a container, don't use the container
    as the machine to run the ip route show command on ("machine"), use a real
    box because lxc will just send everything to its host machine, so it is on
    a subnet containing itself. Not much use.

    :param client: A Juju client
    :param machine: ID of the machine on which to run a command
    :param addr: find the connected subnet containing this address
    :return: CIDR (e.g. '10.0.0.0/24') of the first matching route
    :raises ValueError: if no route with a CIDR destination is found
    """
    ip_cmd = ' '.join(['ip', 'route', 'show', 'to', 'match', addr])
    routes = ssh(client, machine, ip_cmd)

    # Capture the whole destination CIDR at the start of a route line. The
    # 'default' route has no CIDR destination and so is never matched.
    match = re.search(r'^([\d.]+/\d+)(?=\s|$)', routes, re.MULTILINE)
    if match is None:
        raise ValueError("Unable to find route to %r" % addr)
    return match.group(1)


def assess_network_traffic(client, targets):
    """Test that all containers in target can talk to target[0]

    :param client: Juju client
    :param targets: machine IDs of machines to test
    :return: None; raises ValueError if a message does not arrive
    """
    status = client.wait_for_started().status
    log.info('Assessing network traffic.')
    source = targets[0]
    dests = targets[1:]

    # Write the listener script locally and copy it to the source machine.
    # The file is closed before the copy and always removed afterwards.
    script = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        with script:
            script.write(
                'tmux new-session -d -s test "nc -l 6778 > nc_listen.out"')
        client.juju(
            'scp',
            ('--proxy', script.name, source + ':/home/ubuntu/listen.sh'))
    finally:
        os.remove(script.name)

    # Containers are named 'x/type/y' where x is the host of the container.
    # We need the host to look the container up in the status output.
    host = source.split('/')[0]
    address = status['machines'][host]['containers'][source]['dns-name']

    for dest in dests:
        log.info('Assessing network traffic for {}.'.format(dest))
        msg = get_random_string()
        ssh(client, source, 'rm nc_listen.out; bash ./listen.sh')
        ssh(client, dest,
            'echo "{msg}" | nc {addr} 6778'.format(msg=msg, addr=address))
        result = ssh(client, source, 'more nc_listen.out')
        if msg not in result:
            raise ValueError("Wrong or missing message: %r" % result.rstrip())
        log.info('SUCCESS.')


def private_address(client, host):
    """Return the IPv4 address of the device carrying host's default route.

    :param client: Juju client
    :param host: ID of the machine to inspect
    :return: the IPv4 address as a string, without the prefix length
    :raises JujuAssertionError: if the device or its address cannot be found
    """
    default_route = ssh(client, host, 'ip -4 -o route list 0/0')
    log.info("Default route from {}: {}".format(host, default_route))
    # Match the device that is the word after 'dev'. eg.
    # default via 10.0.30.1 dev br-eth1 onlink'
    route_match = re.search(r'\sdev\s([\w-]+)', default_route)
    if route_match is None:
        raise JujuAssertionError(
            "Failed to find device in {}".format(default_route))
    device = route_match.group(1)
    log.info("Fetching the device IP of {}".format(device))
    device_ip = ssh(client, host, 'ip -4 -o addr show {}'.format(device))
    log.info("Device IP for {}: {}".format(host, device_ip))
    ip_match = re.search(r'inet\s+(\S+)/\d+\s', device_ip)
    if ip_match is None:
        raise JujuAssertionError(
            "Failed to find ip for device: {}".format(device))
    return ip_match.group(1)


def assess_address_range(client, targets):
    """Test that two containers are in the same subnet as their host

    :param client: Juju client
    :param targets: machine IDs of machines to test
    :return: None; raises ValueError on failure
    """
    log.info('Assessing address range.')
    status = client.wait_for_started().status

    # Each host's subnet is looked up over ssh only once.
    host_subnet_cache = {}

    for target in targets:
        log.info('Assessing address range for {}.'.format(target))
        host = target.split('/')[0]

        if host in host_subnet_cache:
            host_subnet = host_subnet_cache[host]
        else:
            host_address = private_address(client, host)
            host_subnet = find_network(client, host, host_address)
            host_subnet_cache[host] = host_subnet

        addr = status['machines'][host]['containers'][target]['dns-name']
        subnet = find_network(client, host, addr)
        if host_subnet != subnet:
            raise ValueError(
                '{} ({}) not on the same subnet as {} ({})'.format(
                    target, subnet, host, host_subnet))
        log.info('SUCCESS.')


def assess_internet_connection(client, targets):
    """Test that targets can ping their default route

    :param client: Juju client
    :param targets: machine IDs of machines to test
    :return: None; raises ValueError on failure
    """
    log.info('Assessing internet connection.')
    for target in targets:
        log.info("Assessing internet connection for {}".format(target))
        routes = ssh(client, target, 'ip route show')

        d = re.search(r'^default\s+via\s+([\d\.]+)\s+', routes, re.MULTILINE)
        if d:
            # check=False: inspect the return code instead of raising.
            rc, _ = client.juju(
                'ssh',
                ('--proxy', target, 'ping -c1 -q ' + d.group(1)), check=False)
            if rc != 0:
                raise ValueError('%s unable to ping default route' % target)
        else:
            raise ValueError("Default route not found")
        log.info("SUCCESS")


def _assessment_iteration(client, containers):
    """Run the network tests on this collection of containers

    :param client: Juju client
    :param containers: list of containers to run tests between
    :return: None
    """
    assess_internet_connection(client, containers)
    assess_address_range(client, containers)
    assess_network_traffic(client, containers)


def _assess_container_networking(client, types, hosts, containers):
    """Run _assessment_iteration on all useful combinations of containers

    :param client: Juju client
    :param types: container types to test
    :param hosts: list of the two host machine IDs
    :param containers: {container_type}{host}[containers], as returned by
        make_machines
    :return: None
    """
    for container_type in types:
        # Test with two containers on the same host
        _assessment_iteration(client, containers[container_type][hosts[0]])

        # Now test with two containers on two different hosts
        test_containers = [
            containers[container_type][hosts[0]][0],
            containers[container_type][hosts[1]][0],
            ]
        _assessment_iteration(client, test_containers)

    if KVM_MACHINE in types and LXC_MACHINE in types:
        # Test with an LXC and a KVM on the same machine
        test_containers = [
            containers[LXC_MACHINE][hosts[0]][0],
            containers[KVM_MACHINE][hosts[0]][0],
            ]
        _assessment_iteration(client, test_containers)

        # Test with an LXC and a KVM on different machines
        test_containers = [
            containers[LXC_MACHINE][hosts[0]][0],
            containers[KVM_MACHINE][hosts[1]][0],
            ]
        _assessment_iteration(client, test_containers)


def get_uptime(client, host):
    """Return the uptime of a machine in minutes, from `uptime -p`.

    Every "<number> <unit>" pair in the output (e.g. "up 1 day, 2 hours,
    5 minutes") is converted to minutes and summed, so that a later reading
    on a machine that has not rebooted is never smaller than an earlier one.

    :param client: Juju client
    :param host: ID of the machine to query
    :return: uptime in whole minutes; 0 if no duration could be parsed
    """
    uptime_output = ssh(client, host, 'uptime -p')
    log.info('uptime -p: {}'.format(uptime_output))
    parts = re.findall(
        r'(\d+)\s+(decade|year|week|day|hour|minute)s?\b', uptime_output)
    return sum(
        int(count) * _UPTIME_UNIT_MINUTES[unit] for count, unit in parts)


def assess_container_networking(client, types, space):
    """Runs _assess_container_networking, reboots hosts, repeat.

    :param client: Juju client
    :param types: Container types to test
    :param space: network space to constrain containers to, or None
    :return: None
    """
    log.info("Setting up test.")
    hosts, containers = make_machines(client, types, space)
    status = client.wait_for_started().status
    log.info("Setup complete.")
    log.info("Test started.")

    _assess_container_networking(client, types, hosts, containers)

    # Reboot all hosted modelled machines then the controller.
    log.info("Instrumenting reboot of all machines.")
    try:
        for host in hosts:
            log.info("Restarting hosted machine: {}".format(host))
            client.juju(
                'run', ('--machine', host, 'sudo shutdown -r +1'))
        client.juju('show-action-status', ('--name', 'juju-run'))

        log.info("Restarting controller machine 0")
        controller_client = client.get_controller_client()
        controller_status = controller_client.get_status()
        controller_host = controller_status.status['machines']['0']['dns-name']
        first_uptime = get_uptime(controller_client, '0')
        ssh(controller_client, '0', 'sudo shutdown -r +1')
        # Ensure the reboots have started.
        time.sleep(70)
    except subprocess.CalledProcessError as e:
        log.info(
            "Error running shutdown:\nstdout: %s\nstderr: %s",
            e.output, getattr(e, 'stderr', None))
        raise

    # Wait for the controller to shut down if it has not yet restarted.
    # This ensure the call to wait_for_started happens after each host
    # has restarted.
    second_uptime = get_uptime(controller_client, '0')
    if second_uptime > first_uptime:
        wait_for_port(controller_host, 22, closed=True, timeout=300)
    client.wait_for_started()

    # Once Juju is up it can take a little while before ssh responds.
    for host in hosts:
        hostname = status['machines'][host]['dns-name']
        wait_for_port(hostname, 22, timeout=240)
    log.info("Reboot complete and all hosts ready for retest.")

    _assess_container_networking(client, types, hosts, containers)
    log.info("PASS")


@contextlib.contextmanager
def cleaned_bootstrap_context(bs_manager, args):
    """Bootstrap a temporary environment and yield while it is running.

    :param bs_manager: the BootstrapManager driving the test
    :param args: parsed command line arguments; upload_tools is read
    """
    client = bs_manager.client
    # TODO(gz): Having to manipulate client env state here to get the temp env
    # is ugly, would ideally be captured in an explicit scope.
    update_env(client.env, bs_manager.temp_env_name, series=bs_manager.series,
               agent_url=bs_manager.agent_url,
               agent_stream=bs_manager.agent_stream, region=bs_manager.region)
    with bs_manager.top_context() as machines:
        with bs_manager.bootstrap_context(machines):
            client.bootstrap(args.upload_tools)
        with bs_manager.runtime_context(machines):
            yield


def _get_container_types(client, machine_type):
    """
    Give list of container types to run testing against.

    If a machine_type was explicitly specified, only test against those kind
    of containers. Otherwise, test all possible containers for the given
    juju version.

    :param client: Juju client
    :param machine_type: a single container type, or a false value for all
    :return: list of container types; sorted when all types are returned
    :raises Exception: if machine_type is not supported by the client
    """
    if machine_type:
        if machine_type not in client.supported_container_types:
            raise Exception(
                "no {} support on juju {}".format(machine_type,
                                                  client.version))
        return [machine_type]
    # TODO(gz): Only include LXC for 1.X clients
    return sorted(client.supported_container_types)


def main(argv=None):
    """Bootstrap an environment and run the container networking test.

    :param argv: argument list to parse; None lets argparse use sys.argv
    :return: 0 on success
    """
    args = parse_args(argv)
    configure_logging(args.verbose)
    bs_manager = BootstrapManager.from_args(args)
    client = bs_manager.client
    machine_types = _get_container_types(client, args.machine_type)
    with cleaned_bootstrap_context(bs_manager, args):
        assess_container_networking(bs_manager.client, machine_types,
                                    args.space)
    return 0


if __name__ == '__main__':
    sys.exit(main())