github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/acceptancetests/assess_container_networking.py (about)

     1  #!/usr/bin/env python3
     2  from __future__ import print_function
     3  from argparse import ArgumentParser
     4  import contextlib
     5  from copy import (
     6      copy,
     7      deepcopy,
     8      )
     9  import logging
    10  import re
    11  import os
    12  import subprocess
    13  import sys
    14  import tempfile
    15  from textwrap import dedent
    16  import time
    17  
    18  from utility import (
    19      JujuAssertionError,
    20      add_basic_testing_arguments,
    21      configure_logging,
    22      wait_for_port,
    23      )
    24  from deploy_stack import (
    25      BootstrapManager,
    26      get_random_string,
    27      update_env,
    28      )
    29  from jujupy import (
    30      KVM_MACHINE,
    31      LXC_MACHINE,
    32      LXD_MACHINE,
    33      )
    34  
    35  
    36  __metaclass__ = type
    37  
    38  
    39  log = logging.getLogger("assess_container_networking")
    40  
    41  
    42  def parse_args(argv=None):
    43      """Parse all arguments."""
    44  
    45      description = dedent("""\
    46      Test container address allocation.
    47      For LXC and KVM, create machines of each type and test the network
    48      between LXC <--> LXC, KVM <--> KVM and LXC <--> KVM. Also test machine
    49      to outside world, DNS and that these tests still pass after a reboot. In
    50      case of failure pull logs and configuration files from the machine that
    51      we detected a problem on for later analysis.
    52      """)
    53      parser = add_basic_testing_arguments(
    54          ArgumentParser(description=description),
    55          existing=False)
    56      parser.add_argument(
    57          '--machine-type',
    58          help='Which virtual machine/container type to test. Defaults to all.',
    59          choices=[KVM_MACHINE, LXC_MACHINE, LXD_MACHINE])
    60      parser.add_argument(
    61          '--space-constraint',
    62          help=('The network space to constrain containers to. '
    63                'Default is no space constraints.'),
    64          default=None,
    65          dest='space')
    66      args = parser.parse_args(argv)
    67      return args
    68  
    69  
    70  def ssh(client, machine, cmd):
    71      """Convenience function: run a juju ssh command and get back the output
    72      :param client: A Juju client
    73      :param machine: ID of the machine on which to run a command
    74      :param cmd: the command to run
    75      :return: text output of the command
    76      """
    77      back_off = 2
    78      attempts = 4
    79      for attempt in range(attempts):
    80          try:
    81              return client.get_juju_output('ssh', '--proxy', machine, cmd)
    82          except subprocess.CalledProcessError as e:
    83              # If the connection to the host failed, try again in a couple of
    84              # seconds. This is usually due to heavy load.
    85              if(attempt < attempts - 1 and
    86                  re.search('ssh_exchange_identification: '
    87                            'Connection closed by remote host', e.stderr)):
    88                  time.sleep(back_off)
    89                  back_off *= 2
    90              else:
    91                  raise
    92  
    93  
    94  def make_machines(client, container_types, space):
    95      """Make a test environment consisting of:
    96         Two host machines.
    97         Two of each container_type on one host machine.
    98         One of each container_type on one host machine.
    99      :param client: A ModelClient
   100      :param container_types: list of containers to create
   101      :return: hosts (list), containers {container_type}{host}[containers]
   102      """
   103      # Find existing host machines
   104      old_hosts = client.get_status().status['machines']
   105      machines_to_add = 2 - len(old_hosts)
   106  
   107      # Allocate more hosts as needed
   108      if machines_to_add > 0:
   109          client.juju('add-machine', ('-n', str(machines_to_add)))
   110      status = client.wait_for_started()
   111      hosts = sorted(status.status['machines'].keys())[:2]
   112  
   113      # Find existing containers
   114      required = dict(zip(hosts, [copy(container_types) for h in hosts]))
   115      required[hosts[0]] += container_types
   116      for c in status.iter_machines(containers=True, machines=False):
   117          host, type, id = c[0].split('/')
   118          if type in required[host]:
   119              required[host].remove(type)
   120  
   121      # Start any new containers we need
   122      sargs = []
   123      if space:
   124          sargs = ['--constraints', 'spaces=' + space]
   125  
   126      for host, containers in iter(required.items()):
   127          for container in containers:
   128              client.juju('add-machine',
   129                          tuple(['{}:{}'.format(container, host)] + sargs))
   130  
   131      status = client.wait_for_started()
   132  
   133      # Build a list of containers, now they have all started
   134      tmp = dict(zip(hosts, [[] for h in hosts]))
   135      containers = dict(zip(container_types,
   136                            [deepcopy(tmp) for t in container_types]))
   137      for c in status.iter_machines(containers=True, machines=False):
   138          host, type, id = c[0].split('/')
   139          if type in containers and host in containers[type]:
   140              containers[type][host].append(c[0])
   141      return hosts, containers
   142  
   143  
   144  def find_network(client, machine, addr):
   145      """Find a connected subnet containing the given address.
   146  
   147      When using this to find the subnet of a container, don't use the container
   148      as the machine to run the ip route show command on ("machine"), use a real
   149      box because lxc will just send everything to its host machine, so it is on
   150      a subnet containing itself. Not much use.
   151      :param client: A Juju client
   152      :param machine: ID of the machine on which to run a command
   153      :param addr: find the connected subnet containing this address
   154      :return: CIDR containing the address if found, else, None
   155      """
   156      ip_cmd = ' '.join(['ip', 'route', 'show', 'to', 'match', addr])
   157      routes = ssh(client, machine, ip_cmd)
   158  
   159      for route in re.findall(r'^(\S+).*[\d\.]+/\d+', routes, re.MULTILINE):
   160          if route != 'default':
   161              return route
   162  
   163      raise ValueError("Unable to find route to %r" % addr)
   164  
   165  
   166  def assess_network_traffic(client, targets):
   167      """Test that all containers in target can talk to target[0]
   168      :param client: Juju client
   169      :param targets: machine IDs of machines to test
   170      :return: None;
   171      """
   172      status = client.wait_for_started().status
   173      log.info('Assessing network traffic.')
   174      source = targets[0]
   175      dests = targets[1:]
   176  
   177      with tempfile.NamedTemporaryFile(delete=False) as f:
   178          f.write(
   179              'tmux new-session -d -s test "nc -l 6778 > nc_listen.out"'.encode(
   180                  'utf-8'))
   181      client.juju('scp', ('--proxy', f.name, source + ':/home/ubuntu/listen.sh'))
   182      os.remove(f.name)
   183  
   184      # Containers are named 'x/type/y' where x is the host of the container. We
   185      host = source.split('/')[0]
   186      address = status['machines'][host]['containers'][source]['dns-name']
   187  
   188      for dest in dests:
   189          log.info('Assessing network traffic for {}.'.format(dest))
   190          msg = get_random_string()
   191          ssh(client, source, 'rm nc_listen.out; bash ./listen.sh')
   192          ssh(client, dest,
   193              'echo "{msg}" | nc -q 0 {addr} 6778'.format(msg=msg, addr=address))
   194          # This command will block until *any* data appears in the file, tee the
   195          # output and return control back to us.
   196          result = ssh(client, source, 'tail -F nc_listen.out | sed "/.*/ q"')
   197          if msg not in result:
   198              raise ValueError("Wrong or missing message: %r" % result.rstrip())
   199          log.info('SUCCESS.')
   200  
   201  
   202  def private_address(client, host):
   203      default_route = ssh(client, host, 'ip -4 -o route list 0/0')
   204      log.info("Default route from {}: {}".format(host, default_route))
   205      # Match the device that is the word after 'dev'. eg.
   206      # default via 10.0.30.1 dev br-eth1 onlink'
   207      route_match = re.search(r'\sdev\s([\w-]+)', default_route)
   208      if route_match is None:
   209          raise JujuAssertionError(
   210              "Failed to find device in {}".format(default_route))
   211      device = route_match.group(1)
   212      log.info("Fetching the device IP of {}".format(device))
   213      device_ip = ssh(client, host, 'ip -4 -o addr show {}'.format(device))
   214      log.info("Device IP for {}: {}".format(host, device_ip))
   215      ip_match = re.search(r'inet\s+(\S+)/\d+\s', device_ip)
   216      if ip_match is None:
   217          raise JujuAssertionError(
   218              "Failed to find ip for device: {}".format(device))
   219      return ip_match.group(1)
   220  
   221  
   222  def assess_address_range(client, targets):
   223      """Test that two containers are in the same subnet as their host
   224      :param client: Juju client
   225      :param targets: machine IDs of machines to test
   226      :return: None; raises ValueError on failure
   227      """
   228      log.info('Assessing address range.')
   229      status = client.wait_for_started().status
   230  
   231      host_subnet_cache = {}
   232  
   233      for target in targets:
   234          log.info('Assessing address range for {}.'.format(target))
   235          host = target.split('/')[0]
   236  
   237          if host in host_subnet_cache:
   238              host_subnet = host_subnet_cache[host]
   239          else:
   240              host_address = private_address(client, host)
   241              host_subnet = find_network(client, host, host_address)
   242              host_subnet_cache[host] = host_subnet
   243  
   244          addr = status['machines'][host]['containers'][target]['dns-name']
   245          subnet = find_network(client, host, addr)
   246          if host_subnet != subnet:
   247              raise ValueError(
   248                  '{} ({}) not on the same subnet as {} ({})'.format(
   249                      target, subnet, host, host_subnet))
   250          log.info('SUCCESS.')
   251  
   252  
   253  def assess_internet_connection(client, targets):
   254      """Test that targets can ping their default route
   255      :param client: Juju client
   256      :param targets: machine IDs of machines to test
   257      :return: None; raises ValueError on failure
   258      """
   259      log.info('Assessing internet connection.')
   260      for target in targets:
   261          log.info("Assessing internet connection for {}".format(target))
   262          routes = ssh(client, target, 'ip route show')
   263  
   264          d = re.search(r'^default\s+via\s+([\d\.]+)\s+', routes, re.MULTILINE)
   265          if d:
   266              rc, _ = client.juju(
   267                  'ssh',
   268                  ('--proxy', target, 'ping -c1 -q ' + d.group(1)), check=False)
   269              if rc != 0:
   270                  raise ValueError('%s unable to ping default route' % target)
   271          else:
   272              raise ValueError("Default route not found")
   273          log.info("SUCCESS")
   274  
   275  
   276  def _assessment_iteration(client, containers):
   277      """Run the network tests on this collection of machines and containers
   278      :param client: Juju client
   279      :param hosts: list of hosts of containers
   280      :param containers: list of containers to run tests between
   281      :return: None
   282      """
   283      assess_internet_connection(client, containers)
   284      assess_address_range(client, containers)
   285      assess_network_traffic(client, containers)
   286  
   287  
   288  def _assess_container_networking(client, types, hosts, containers):
   289      """Run _assessment_iteration on all useful combinations of containers
   290      :param client: Juju client
   291      :param args: Parsed command line arguments
   292      :return: None
   293      """
   294      for container_type in types:
   295          # Test with two containers on the same host
   296          _assessment_iteration(client, containers[container_type][hosts[0]])
   297  
   298          # Now test with two containers on two different hosts
   299          test_containers = [
   300              containers[container_type][hosts[0]][0],
   301              containers[container_type][hosts[1]][0],
   302          ]
   303          _assessment_iteration(client, test_containers)
   304  
   305      if KVM_MACHINE in types and LXC_MACHINE in types:
   306          test_containers = [
   307              containers[LXC_MACHINE][hosts[0]][0],
   308              containers[KVM_MACHINE][hosts[0]][0],
   309          ]
   310          _assessment_iteration(client, test_containers)
   311  
   312          # Test with an LXC and a KVM on different machines
   313          test_containers = [
   314              containers[LXC_MACHINE][hosts[0]][0],
   315              containers[KVM_MACHINE][hosts[1]][0],
   316          ]
   317          _assessment_iteration(client, test_containers)
   318  
   319  
   320  def get_uptime(client, host):
   321      uptime_pattern = re.compile(r'.*?([\d]+)')
   322      uptime_output = ssh(client, host, 'uptime -p')
   323      log.info('uptime -p: {}'.format(uptime_output))
   324      match = uptime_pattern.match(uptime_output)
   325      if match:
   326          return int(match.group(1))
   327      else:
   328          return 0
   329  
   330  
   331  def assess_container_networking(client, types, space):
   332      """Runs _assess_address_allocation, reboots hosts, repeat.
   333  
   334      :param client: Juju client
   335      :param types: Container types to test
   336      :return: None
   337      """
   338      log.info("Setting up test.")
   339      hosts, containers = make_machines(client, types, space)
   340      status = client.wait_for_started().status
   341      log.info("Setup complete.")
   342      log.info("Test started.")
   343  
   344      _assess_container_networking(client, types, hosts, containers)
   345  
   346      # Reboot all hosted modelled machines then the controller.
   347      log.info("Instrumenting reboot of all machines.")
   348      try:
   349          for host in hosts:
   350              log.info("Restarting hosted machine: {}".format(host))
   351              client.reboot(host)
   352  
   353          log.info("Restarting controller machine 0")
   354          controller_client = client.get_controller_client()
   355          controller_status = controller_client.get_status()
   356          controller_host = controller_status.status['machines']['0']['dns-name']
   357          first_uptime = get_uptime(controller_client, '0')
   358          controller_client.reboot('0')
   359          # Ensure the reboots have started.
   360          time.sleep(70)
   361      except subprocess.CalledProcessError as e:
   362          logging.info(
   363              "Error running shutdown:\nstdout: %s\nstderr: %s",
   364              e.output, getattr(e, 'stderr', None))
   365          raise
   366  
   367      # Wait for the controller to shut down if it has not yet restarted.
   368      # This ensure the call to wait_for_started happens after each host
   369      # has restarted.
   370      second_uptime = get_uptime(controller_client, '0')
   371      if second_uptime > first_uptime:
   372          wait_for_port(controller_host, 22, closed=True, timeout=300)
   373      client.wait_for_started()
   374  
   375      # Once Juju is up it can take a little while before ssh responds.
   376      for host in hosts:
   377          hostname = status['machines'][host]['dns-name']
   378          wait_for_port(hostname, 22, timeout=240)
   379      log.info("Reboot complete and all hosts ready for retest.")
   380  
   381      _assess_container_networking(client, types, hosts, containers)
   382      log.info("PASS")
   383  
   384  
   385  @contextlib.contextmanager
   386  def cleaned_bootstrap_context(bs_manager, args):
   387      client = bs_manager.client
   388      # TODO(gz): Having to manipulate client env state here to get the temp env
   389      #           is ugly, would ideally be captured in an explicit scope.
   390      update_env(client.env, bs_manager.temp_env_name, series=bs_manager.series,
   391                 agent_url=bs_manager.agent_url,
   392                 agent_stream=bs_manager.agent_stream, region=bs_manager.region)
   393      with bs_manager.top_context() as machines:
   394          with bs_manager.bootstrap_context(machines):
   395              client.bootstrap(args.upload_tools)
   396          with bs_manager.runtime_context(machines):
   397              yield
   398  
   399  
   400  def _get_container_types(client, machine_type):
   401      """
   402      Give list of container types to run testing against.
   403  
   404      If a machine_type was explicitly specified, only test against those kind
   405      of containers. Otherwise, test all possible containers for the given
   406      juju version.
   407      """
   408      if machine_type:
   409          if machine_type not in client.supported_container_types:
   410              raise Exception(
   411                  "no {} support on juju {}".format(machine_type,
   412                                                    client.version))
   413          return [machine_type]
   414      # TODO(gz): Only include LXC for 1.X clients
   415      types = list(client.supported_container_types)
   416      types.sort()
   417      return types
   418  
   419  
   420  def main(argv=None):
   421      args = parse_args(argv)
   422      configure_logging(args.verbose)
   423      bs_manager = BootstrapManager.from_args(args)
   424      client = bs_manager.client
   425      machine_types = _get_container_types(client, args.machine_type)
   426      with cleaned_bootstrap_context(bs_manager, args):
   427          assess_container_networking(bs_manager.client, machine_types,
   428                                      args.space)
   429      return 0
   430  
   431  
   432  if __name__ == '__main__':
   433      sys.exit(main())