github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/acceptancetests/assess_container_networking.py (about) 1 #!/usr/bin/env python3 2 from __future__ import print_function 3 from argparse import ArgumentParser 4 import contextlib 5 from copy import ( 6 copy, 7 deepcopy, 8 ) 9 import logging 10 import re 11 import os 12 import subprocess 13 import sys 14 import tempfile 15 from textwrap import dedent 16 import time 17 18 from utility import ( 19 JujuAssertionError, 20 add_basic_testing_arguments, 21 configure_logging, 22 wait_for_port, 23 ) 24 from deploy_stack import ( 25 BootstrapManager, 26 get_random_string, 27 update_env, 28 ) 29 from jujupy import ( 30 KVM_MACHINE, 31 LXC_MACHINE, 32 LXD_MACHINE, 33 ) 34 35 36 __metaclass__ = type 37 38 39 log = logging.getLogger("assess_container_networking") 40 41 42 def parse_args(argv=None): 43 """Parse all arguments.""" 44 45 description = dedent("""\ 46 Test container address allocation. 47 For LXC and KVM, create machines of each type and test the network 48 between LXC <--> LXC, KVM <--> KVM and LXC <--> KVM. Also test machine 49 to outside world, DNS and that these tests still pass after a reboot. In 50 case of failure pull logs and configuration files from the machine that 51 we detected a problem on for later analysis. 52 """) 53 parser = add_basic_testing_arguments( 54 ArgumentParser(description=description), 55 existing=False) 56 parser.add_argument( 57 '--machine-type', 58 help='Which virtual machine/container type to test. Defaults to all.', 59 choices=[KVM_MACHINE, LXC_MACHINE, LXD_MACHINE]) 60 parser.add_argument( 61 '--space-constraint', 62 help=('The network space to constrain containers to. ' 63 'Default is no space constraints.'), 64 default=None, 65 dest='space') 66 args = parser.parse_args(argv) 67 return args 68 69 70 def ssh(client, machine, cmd): 71 """Convenience function: run a juju ssh command and get back the output 72 :param client: A Juju client 73 :param machine: ID of the machine on which to run a command 74 :param cmd: the command to run 75 :return: text output of the command 76 """ 77 back_off = 2 78 attempts = 4 79 for attempt in range(attempts): 80 try: 81 return client.get_juju_output('ssh', '--proxy', machine, cmd) 82 except subprocess.CalledProcessError as e: 83 # If the connection to the host failed, try again in a couple of 84 # seconds. This is usually due to heavy load. 85 if(attempt < attempts - 1 and 86 re.search('ssh_exchange_identification: ' 87 'Connection closed by remote host', e.stderr)): 88 time.sleep(back_off) 89 back_off *= 2 90 else: 91 raise 92 93 94 def make_machines(client, container_types, space): 95 """Make a test environment consisting of: 96 Two host machines. 97 Two of each container_type on one host machine. 98 One of each container_type on one host machine. 99 :param client: A ModelClient 100 :param container_types: list of containers to create 101 :return: hosts (list), containers {container_type}{host}[containers] 102 """ 103 # Find existing host machines 104 old_hosts = client.get_status().status['machines'] 105 machines_to_add = 2 - len(old_hosts) 106 107 # Allocate more hosts as needed 108 if machines_to_add > 0: 109 client.juju('add-machine', ('-n', str(machines_to_add))) 110 status = client.wait_for_started() 111 hosts = sorted(status.status['machines'].keys())[:2] 112 113 # Find existing containers 114 required = dict(zip(hosts, [copy(container_types) for h in hosts])) 115 required[hosts[0]] += container_types 116 for c in status.iter_machines(containers=True, machines=False): 117 host, type, id = c[0].split('/') 118 if type in required[host]: 119 required[host].remove(type) 120 121 # Start any new containers we need 122 sargs = [] 123 if space: 124 sargs = ['--constraints', 'spaces=' + space] 125 126 for host, containers in iter(required.items()): 127 for container in containers: 128 client.juju('add-machine', 129 tuple(['{}:{}'.format(container, host)] + sargs)) 130 131 status = client.wait_for_started() 132 133 # Build a list of containers, now they have all started 134 tmp = dict(zip(hosts, [[] for h in hosts])) 135 containers = dict(zip(container_types, 136 [deepcopy(tmp) for t in container_types])) 137 for c in status.iter_machines(containers=True, machines=False): 138 host, type, id = c[0].split('/') 139 if type in containers and host in containers[type]: 140 containers[type][host].append(c[0]) 141 return hosts, containers 142 143 144 def find_network(client, machine, addr): 145 """Find a connected subnet containing the given address. 146 147 When using this to find the subnet of a container, don't use the container 148 as the machine to run the ip route show command on ("machine"), use a real 149 box because lxc will just send everything to its host machine, so it is on 150 a subnet containing itself. Not much use. 151 :param client: A Juju client 152 :param machine: ID of the machine on which to run a command 153 :param addr: find the connected subnet containing this address 154 :return: CIDR containing the address if found, else, None 155 """ 156 ip_cmd = ' '.join(['ip', 'route', 'show', 'to', 'match', addr]) 157 routes = ssh(client, machine, ip_cmd) 158 159 for route in re.findall(r'^(\S+).*[\d\.]+/\d+', routes, re.MULTILINE): 160 if route != 'default': 161 return route 162 163 raise ValueError("Unable to find route to %r" % addr) 164 165 166 def assess_network_traffic(client, targets): 167 """Test that all containers in target can talk to target[0] 168 :param client: Juju client 169 :param targets: machine IDs of machines to test 170 :return: None; 171 """ 172 status = client.wait_for_started().status 173 log.info('Assessing network traffic.') 174 source = targets[0] 175 dests = targets[1:] 176 177 with tempfile.NamedTemporaryFile(delete=False) as f: 178 f.write( 179 'tmux new-session -d -s test "nc -l 6778 > nc_listen.out"'.encode( 180 'utf-8')) 181 client.juju('scp', ('--proxy', f.name, source + ':/home/ubuntu/listen.sh')) 182 os.remove(f.name) 183 184 # Containers are named 'x/type/y' where x is the host of the container. We 185 host = source.split('/')[0] 186 address = status['machines'][host]['containers'][source]['dns-name'] 187 188 for dest in dests: 189 log.info('Assessing network traffic for {}.'.format(dest)) 190 msg = get_random_string() 191 ssh(client, source, 'rm nc_listen.out; bash ./listen.sh') 192 ssh(client, dest, 193 'echo "{msg}" | nc -q 0 {addr} 6778'.format(msg=msg, addr=address)) 194 # This command will block until *any* data appears in the file, tee the 195 # output and return control back to us. 196 result = ssh(client, source, 'tail -F nc_listen.out | sed "/.*/ q"') 197 if msg not in result: 198 raise ValueError("Wrong or missing message: %r" % result.rstrip()) 199 log.info('SUCCESS.') 200 201 202 def private_address(client, host): 203 default_route = ssh(client, host, 'ip -4 -o route list 0/0') 204 log.info("Default route from {}: {}".format(host, default_route)) 205 # Match the device that is the word after 'dev'. eg. 206 # default via 10.0.30.1 dev br-eth1 onlink' 207 route_match = re.search(r'\sdev\s([\w-]+)', default_route) 208 if route_match is None: 209 raise JujuAssertionError( 210 "Failed to find device in {}".format(default_route)) 211 device = route_match.group(1) 212 log.info("Fetching the device IP of {}".format(device)) 213 device_ip = ssh(client, host, 'ip -4 -o addr show {}'.format(device)) 214 log.info("Device IP for {}: {}".format(host, device_ip)) 215 ip_match = re.search(r'inet\s+(\S+)/\d+\s', device_ip) 216 if ip_match is None: 217 raise JujuAssertionError( 218 "Failed to find ip for device: {}".format(device)) 219 return ip_match.group(1) 220 221 222 def assess_address_range(client, targets): 223 """Test that two containers are in the same subnet as their host 224 :param client: Juju client 225 :param targets: machine IDs of machines to test 226 :return: None; raises ValueError on failure 227 """ 228 log.info('Assessing address range.') 229 status = client.wait_for_started().status 230 231 host_subnet_cache = {} 232 233 for target in targets: 234 log.info('Assessing address range for {}.'.format(target)) 235 host = target.split('/')[0] 236 237 if host in host_subnet_cache: 238 host_subnet = host_subnet_cache[host] 239 else: 240 host_address = private_address(client, host) 241 host_subnet = find_network(client, host, host_address) 242 host_subnet_cache[host] = host_subnet 243 244 addr = status['machines'][host]['containers'][target]['dns-name'] 245 subnet = find_network(client, host, addr) 246 if host_subnet != subnet: 247 raise ValueError( 248 '{} ({}) not on the same subnet as {} ({})'.format( 249 target, subnet, host, host_subnet)) 250 log.info('SUCCESS.') 251 252 253 def assess_internet_connection(client, targets): 254 """Test that targets can ping their default route 255 :param client: Juju client 256 :param targets: machine IDs of machines to test 257 :return: None; raises ValueError on failure 258 """ 259 log.info('Assessing internet connection.') 260 for target in targets: 261 log.info("Assessing internet connection for {}".format(target)) 262 routes = ssh(client, target, 'ip route show') 263 264 d = re.search(r'^default\s+via\s+([\d\.]+)\s+', routes, re.MULTILINE) 265 if d: 266 rc, _ = client.juju( 267 'ssh', 268 ('--proxy', target, 'ping -c1 -q ' + d.group(1)), check=False) 269 if rc != 0: 270 raise ValueError('%s unable to ping default route' % target) 271 else: 272 raise ValueError("Default route not found") 273 log.info("SUCCESS") 274 275 276 def _assessment_iteration(client, containers): 277 """Run the network tests on this collection of machines and containers 278 :param client: Juju client 279 :param hosts: list of hosts of containers 280 :param containers: list of containers to run tests between 281 :return: None 282 """ 283 assess_internet_connection(client, containers) 284 assess_address_range(client, containers) 285 assess_network_traffic(client, containers) 286 287 288 def _assess_container_networking(client, types, hosts, containers): 289 """Run _assessment_iteration on all useful combinations of containers 290 :param client: Juju client 291 :param args: Parsed command line arguments 292 :return: None 293 """ 294 for container_type in types: 295 # Test with two containers on the same host 296 _assessment_iteration(client, containers[container_type][hosts[0]]) 297 298 # Now test with two containers on two different hosts 299 test_containers = [ 300 containers[container_type][hosts[0]][0], 301 containers[container_type][hosts[1]][0], 302 ] 303 _assessment_iteration(client, test_containers) 304 305 if KVM_MACHINE in types and LXC_MACHINE in types: 306 test_containers = [ 307 containers[LXC_MACHINE][hosts[0]][0], 308 containers[KVM_MACHINE][hosts[0]][0], 309 ] 310 _assessment_iteration(client, test_containers) 311 312 # Test with an LXC and a KVM on different machines 313 test_containers = [ 314 containers[LXC_MACHINE][hosts[0]][0], 315 containers[KVM_MACHINE][hosts[1]][0], 316 ] 317 _assessment_iteration(client, test_containers) 318 319 320 def get_uptime(client, host): 321 uptime_pattern = re.compile(r'.*?([\d]+)') 322 uptime_output = ssh(client, host, 'uptime -p') 323 log.info('uptime -p: {}'.format(uptime_output)) 324 match = uptime_pattern.match(uptime_output) 325 if match: 326 return int(match.group(1)) 327 else: 328 return 0 329 330 331 def assess_container_networking(client, types, space): 332 """Runs _assess_address_allocation, reboots hosts, repeat. 333 334 :param client: Juju client 335 :param types: Container types to test 336 :return: None 337 """ 338 log.info("Setting up test.") 339 hosts, containers = make_machines(client, types, space) 340 status = client.wait_for_started().status 341 log.info("Setup complete.") 342 log.info("Test started.") 343 344 _assess_container_networking(client, types, hosts, containers) 345 346 # Reboot all hosted modelled machines then the controller. 347 log.info("Instrumenting reboot of all machines.") 348 try: 349 for host in hosts: 350 log.info("Restarting hosted machine: {}".format(host)) 351 client.reboot(host) 352 353 log.info("Restarting controller machine 0") 354 controller_client = client.get_controller_client() 355 controller_status = controller_client.get_status() 356 controller_host = controller_status.status['machines']['0']['dns-name'] 357 first_uptime = get_uptime(controller_client, '0') 358 controller_client.reboot('0') 359 # Ensure the reboots have started. 360 time.sleep(70) 361 except subprocess.CalledProcessError as e: 362 logging.info( 363 "Error running shutdown:\nstdout: %s\nstderr: %s", 364 e.output, getattr(e, 'stderr', None)) 365 raise 366 367 # Wait for the controller to shut down if it has not yet restarted. 368 # This ensure the call to wait_for_started happens after each host 369 # has restarted. 370 second_uptime = get_uptime(controller_client, '0') 371 if second_uptime > first_uptime: 372 wait_for_port(controller_host, 22, closed=True, timeout=300) 373 client.wait_for_started() 374 375 # Once Juju is up it can take a little while before ssh responds. 376 for host in hosts: 377 hostname = status['machines'][host]['dns-name'] 378 wait_for_port(hostname, 22, timeout=240) 379 log.info("Reboot complete and all hosts ready for retest.") 380 381 _assess_container_networking(client, types, hosts, containers) 382 log.info("PASS") 383 384 385 @contextlib.contextmanager 386 def cleaned_bootstrap_context(bs_manager, args): 387 client = bs_manager.client 388 # TODO(gz): Having to manipulate client env state here to get the temp env 389 # is ugly, would ideally be captured in an explicit scope. 390 update_env(client.env, bs_manager.temp_env_name, series=bs_manager.series, 391 agent_url=bs_manager.agent_url, 392 agent_stream=bs_manager.agent_stream, region=bs_manager.region) 393 with bs_manager.top_context() as machines: 394 with bs_manager.bootstrap_context(machines): 395 client.bootstrap(args.upload_tools) 396 with bs_manager.runtime_context(machines): 397 yield 398 399 400 def _get_container_types(client, machine_type): 401 """ 402 Give list of container types to run testing against. 403 404 If a machine_type was explicitly specified, only test against those kind 405 of containers. Otherwise, test all possible containers for the given 406 juju version. 407 """ 408 if machine_type: 409 if machine_type not in client.supported_container_types: 410 raise Exception( 411 "no {} support on juju {}".format(machine_type, 412 client.version)) 413 return [machine_type] 414 # TODO(gz): Only include LXC for 1.X clients 415 types = list(client.supported_container_types) 416 types.sort() 417 return types 418 419 420 def main(argv=None): 421 args = parse_args(argv) 422 configure_logging(args.verbose) 423 bs_manager = BootstrapManager.from_args(args) 424 client = bs_manager.client 425 machine_types = _get_container_types(client, args.machine_type) 426 with cleaned_bootstrap_context(bs_manager, args): 427 assess_container_networking(bs_manager.client, machine_types, 428 args.space) 429 return 0 430 431 432 if __name__ == '__main__': 433 sys.exit(main())