github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/acceptancetests/assess_container_networking.py (about)

     1  #!/usr/bin/env python
     2  from __future__ import print_function
     3  from argparse import ArgumentParser
     4  import contextlib
     5  from copy import (
     6      copy,
     7      deepcopy,
     8      )
     9  import logging
    10  import re
    11  import os
    12  import subprocess
    13  import sys
    14  import tempfile
    15  from textwrap import dedent
    16  import time
    17  
    18  from deploy_stack import (
    19      BootstrapManager,
    20      get_random_string,
    21      update_env,
    22      )
    23  from jujupy import (
    24      KVM_MACHINE,
    25      LXC_MACHINE,
    26      LXD_MACHINE,
    27      )
    28  from utility import (
    29      JujuAssertionError,
    30      add_basic_testing_arguments,
    31      configure_logging,
    32      wait_for_port,
    33      )
    34  
    35  
# Under Python 2 a module-level ``__metaclass__ = type`` makes classes defined
# in this module without explicit bases new-style; Python 3 ignores it.
__metaclass__ = type


# Module-level logger used by the assessment functions below.
log = logging.getLogger("assess_container_networking")
    40  
    41  
    42  def parse_args(argv=None):
    43      """Parse all arguments."""
    44  
    45      description = dedent("""\
    46      Test container address allocation.
    47      For LXC and KVM, create machines of each type and test the network
    48      between LXC <--> LXC, KVM <--> KVM and LXC <--> KVM. Also test machine
    49      to outside world, DNS and that these tests still pass after a reboot. In
    50      case of failure pull logs and configuration files from the machine that
    51      we detected a problem on for later analysis.
    52      """)
    53      parser = add_basic_testing_arguments(
    54          ArgumentParser(description=description),
    55          existing=False)
    56      parser.add_argument(
    57          '--machine-type',
    58          help='Which virtual machine/container type to test. Defaults to all.',
    59          choices=[KVM_MACHINE, LXC_MACHINE, LXD_MACHINE])
    60      parser.add_argument(
    61          '--space-constraint',
    62          help='The network space to constrain containers to. Default is no space constraints.',
    63          default=None,
    64          dest='space')
    65      args = parser.parse_args(argv)
    66      return args
    67  
    68  
    69  def ssh(client, machine, cmd):
    70      """Convenience function: run a juju ssh command and get back the output
    71      :param client: A Juju client
    72      :param machine: ID of the machine on which to run a command
    73      :param cmd: the command to run
    74      :return: text output of the command
    75      """
    76      back_off = 2
    77      attempts = 4
    78      for attempt in range(attempts):
    79          try:
    80              return client.get_juju_output('ssh', '--proxy', machine, cmd)
    81          except subprocess.CalledProcessError as e:
    82              # If the connection to the host failed, try again in a couple of
    83              # seconds. This is usually due to heavy load.
    84              if(attempt < attempts - 1 and
    85                  re.search('ssh_exchange_identification: '
    86                            'Connection closed by remote host', e.stderr)):
    87                  time.sleep(back_off)
    88                  back_off *= 2
    89              else:
    90                  raise
    91  
    92  
    93  def make_machines(client, container_types, space):
    94      """Make a test environment consisting of:
    95         Two host machines.
    96         Two of each container_type on one host machine.
    97         One of each container_type on one host machine.
    98      :param client: A ModelClient
    99      :param container_types: list of containers to create
   100      :return: hosts (list), containers {container_type}{host}[containers]
   101      """
   102      # Find existing host machines
   103      old_hosts = client.get_status().status['machines']
   104      machines_to_add = 2 - len(old_hosts)
   105  
   106      # Allocate more hosts as needed
   107      if machines_to_add > 0:
   108          client.juju('add-machine', ('-n', str(machines_to_add)))
   109      status = client.wait_for_started()
   110      hosts = sorted(status.status['machines'].keys())[:2]
   111  
   112      # Find existing containers
   113      required = dict(zip(hosts, [copy(container_types) for h in hosts]))
   114      required[hosts[0]] += container_types
   115      for c in status.iter_machines(containers=True, machines=False):
   116          host, type, id = c[0].split('/')
   117          if type in required[host]:
   118              required[host].remove(type)
   119  
   120      # Start any new containers we need
   121      sargs = []
   122      if space:
   123          sargs = ['--constraints', 'spaces=' + space]
   124           
   125      for host, containers in required.iteritems():
   126          for container in containers:
   127              client.juju('add-machine', tuple(['{}:{}'.format(container, host)] + sargs))
   128  
   129      status = client.wait_for_started()
   130  
   131      # Build a list of containers, now they have all started
   132      tmp = dict(zip(hosts, [[] for h in hosts]))
   133      containers = dict(zip(container_types,
   134                            [deepcopy(tmp) for t in container_types]))
   135      for c in status.iter_machines(containers=True, machines=False):
   136          host, type, id = c[0].split('/')
   137          if type in containers and host in containers[type]:
   138              containers[type][host].append(c[0])
   139      return hosts, containers
   140  
   141  
   142  def find_network(client, machine, addr):
   143      """Find a connected subnet containing the given address.
   144  
   145      When using this to find the subnet of a container, don't use the container
   146      as the machine to run the ip route show command on ("machine"), use a real
   147      box because lxc will just send everything to its host machine, so it is on
   148      a subnet containing itself. Not much use.
   149      :param client: A Juju client
   150      :param machine: ID of the machine on which to run a command
   151      :param addr: find the connected subnet containing this address
   152      :return: CIDR containing the address if found, else, None
   153      """
   154      ip_cmd = ' '.join(['ip', 'route', 'show', 'to', 'match', addr])
   155      routes = ssh(client, machine, ip_cmd)
   156  
   157      for route in re.findall(r'^(\S+).*[\d\.]+/\d+', routes, re.MULTILINE):
   158          if route != 'default':
   159              return route
   160  
   161      raise ValueError("Unable to find route to %r" % addr)
   162  
   163  
   164  def assess_network_traffic(client, targets):
   165      """Test that all containers in target can talk to target[0]
   166      :param client: Juju client
   167      :param targets: machine IDs of machines to test
   168      :return: None;
   169      """
   170      status = client.wait_for_started().status
   171      log.info('Assessing network traffic.')
   172      source = targets[0]
   173      dests = targets[1:]
   174  
   175      with tempfile.NamedTemporaryFile(delete=False) as f:
   176          f.write('tmux new-session -d -s test "nc -l 6778 > nc_listen.out"')
   177      client.juju('scp', ('--proxy', f.name, source + ':/home/ubuntu/listen.sh'))
   178      os.remove(f.name)
   179  
   180      # Containers are named 'x/type/y' where x is the host of the container. We
   181      host = source.split('/')[0]
   182      address = status['machines'][host]['containers'][source]['dns-name']
   183  
   184      for dest in dests:
   185          log.info('Assessing network traffic for {}.'.format(dest))
   186          msg = get_random_string()
   187          ssh(client, source, 'rm nc_listen.out; bash ./listen.sh')
   188          ssh(client, dest,
   189              'echo "{msg}" | nc {addr} 6778'.format(msg=msg, addr=address))
   190          result = ssh(client, source, 'more nc_listen.out')
   191          if msg not in result:
   192              raise ValueError("Wrong or missing message: %r" % result.rstrip())
   193          log.info('SUCCESS.')
   194  
   195  
   196  def private_address(client, host):
   197      default_route = ssh(client, host, 'ip -4 -o route list 0/0')
   198      log.info("Default route from {}: {}".format(host, default_route))
   199      # Match the device that is the word after 'dev'. eg.
   200      # default via 10.0.30.1 dev br-eth1 onlink'
   201      route_match = re.search(r'\sdev\s([\w-]+)', default_route)
   202      if route_match is None:
   203          raise JujuAssertionError(
   204              "Failed to find device in {}".format(default_route))
   205      device = route_match.group(1)
   206      log.info("Fetching the device IP of {}".format(device))
   207      device_ip = ssh(client, host, 'ip -4 -o addr show {}'.format(device))
   208      log.info("Device IP for {}: {}".format(host, device_ip))
   209      ip_match = re.search(r'inet\s+(\S+)/\d+\s', device_ip)
   210      if ip_match is None:
   211          raise JujuAssertionError(
   212              "Failed to find ip for device: {}".format(device))
   213      return ip_match.group(1)
   214  
   215  
   216  def assess_address_range(client, targets):
   217      """Test that two containers are in the same subnet as their host
   218      :param client: Juju client
   219      :param targets: machine IDs of machines to test
   220      :return: None; raises ValueError on failure
   221      """
   222      log.info('Assessing address range.')
   223      status = client.wait_for_started().status
   224  
   225      host_subnet_cache = {}
   226  
   227      for target in targets:
   228          log.info('Assessing address range for {}.'.format(target))
   229          host = target.split('/')[0]
   230  
   231          if host in host_subnet_cache:
   232              host_subnet = host_subnet_cache[host]
   233          else:
   234              host_address = private_address(client, host)
   235              host_subnet = find_network(client, host, host_address)
   236              host_subnet_cache[host] = host_subnet
   237  
   238          addr = status['machines'][host]['containers'][target]['dns-name']
   239          subnet = find_network(client, host, addr)
   240          if host_subnet != subnet:
   241              raise ValueError(
   242                  '{} ({}) not on the same subnet as {} ({})'.format(
   243                      target, subnet, host, host_subnet))
   244          log.info('SUCCESS.')
   245  
   246  
   247  def assess_internet_connection(client, targets):
   248      """Test that targets can ping their default route
   249      :param client: Juju client
   250      :param targets: machine IDs of machines to test
   251      :return: None; raises ValueError on failure
   252      """
   253      log.info('Assessing internet connection.')
   254      for target in targets:
   255          log.info("Assessing internet connection for {}".format(target))
   256          routes = ssh(client, target, 'ip route show')
   257  
   258          d = re.search(r'^default\s+via\s+([\d\.]+)\s+', routes, re.MULTILINE)
   259          if d:
   260              rc, _ = client.juju(
   261                  'ssh',
   262                  ('--proxy', target, 'ping -c1 -q ' + d.group(1)), check=False)
   263              if rc != 0:
   264                  raise ValueError('%s unable to ping default route' % target)
   265          else:
   266              raise ValueError("Default route not found")
   267          log.info("SUCCESS")
   268  
   269  
   270  def _assessment_iteration(client, containers):
   271      """Run the network tests on this collection of machines and containers
   272      :param client: Juju client
   273      :param hosts: list of hosts of containers
   274      :param containers: list of containers to run tests between
   275      :return: None
   276      """
   277      assess_internet_connection(client, containers)
   278      assess_address_range(client, containers)
   279      assess_network_traffic(client, containers)
   280  
   281  
   282  def _assess_container_networking(client, types, hosts, containers):
   283      """Run _assessment_iteration on all useful combinations of containers
   284      :param client: Juju client
   285      :param args: Parsed command line arguments
   286      :return: None
   287      """
   288      for container_type in types:
   289          # Test with two containers on the same host
   290          _assessment_iteration(client, containers[container_type][hosts[0]])
   291  
   292          # Now test with two containers on two different hosts
   293          test_containers = [
   294              containers[container_type][hosts[0]][0],
   295              containers[container_type][hosts[1]][0],
   296          ]
   297          _assessment_iteration(client, test_containers)
   298  
   299      if KVM_MACHINE in types and LXC_MACHINE in types:
   300          test_containers = [
   301              containers[LXC_MACHINE][hosts[0]][0],
   302              containers[KVM_MACHINE][hosts[0]][0],
   303          ]
   304          _assessment_iteration(client, test_containers)
   305  
   306          # Test with an LXC and a KVM on different machines
   307          test_containers = [
   308              containers[LXC_MACHINE][hosts[0]][0],
   309              containers[KVM_MACHINE][hosts[1]][0],
   310          ]
   311          _assessment_iteration(client, test_containers)
   312  
   313  
   314  def get_uptime(client, host):
   315      uptime_pattern = re.compile(r'.*(\d+)')
   316      uptime_output = ssh(client, host, 'uptime -p')
   317      log.info('uptime -p: {}'.format(uptime_output))
   318      match = uptime_pattern.match(uptime_output)
   319      if match:
   320          return int(match.group(1))
   321      else:
   322          return 0
   323  
   324  
   325  def assess_container_networking(client, types, space):
   326      """Runs _assess_address_allocation, reboots hosts, repeat.
   327  
   328      :param client: Juju client
   329      :param types: Container types to test
   330      :return: None
   331      """
   332      log.info("Setting up test.")
   333      hosts, containers = make_machines(client, types, space)
   334      status = client.wait_for_started().status
   335      log.info("Setup complete.")
   336      log.info("Test started.")
   337  
   338      _assess_container_networking(client, types, hosts, containers)
   339  
   340      # Reboot all hosted modelled machines then the controller.
   341      log.info("Instrumenting reboot of all machines.")
   342      try:
   343          for host in hosts:
   344              log.info("Restarting hosted machine: {}".format(host))
   345              client.juju(
   346                  'run', ('--machine', host, 'sudo shutdown -r +1'))
   347          client.juju('show-action-status', ('--name', 'juju-run'))
   348  
   349          log.info("Restarting controller machine 0")
   350          controller_client = client.get_controller_client()
   351          controller_status = controller_client.get_status()
   352          controller_host = controller_status.status['machines']['0']['dns-name']
   353          first_uptime = get_uptime(controller_client, '0')
   354          ssh(controller_client, '0', 'sudo shutdown -r +1')
   355          # Ensure the reboots have started.
   356          time.sleep(70)
   357      except subprocess.CalledProcessError as e:
   358          logging.info(
   359              "Error running shutdown:\nstdout: %s\nstderr: %s",
   360              e.output, getattr(e, 'stderr', None))
   361          raise
   362  
   363      # Wait for the controller to shut down if it has not yet restarted.
   364      # This ensure the call to wait_for_started happens after each host
   365      # has restarted.
   366      second_uptime = get_uptime(controller_client, '0')
   367      if second_uptime > first_uptime:
   368          wait_for_port(controller_host, 22, closed=True, timeout=300)
   369      client.wait_for_started()
   370  
   371      # Once Juju is up it can take a little while before ssh responds.
   372      for host in hosts:
   373          hostname = status['machines'][host]['dns-name']
   374          wait_for_port(hostname, 22, timeout=240)
   375      log.info("Reboot complete and all hosts ready for retest.")
   376  
   377      _assess_container_networking(client, types, hosts, containers)
   378      log.info("PASS")
   379  
   380  
   381  @contextlib.contextmanager
   382  def cleaned_bootstrap_context(bs_manager, args):
   383      client = bs_manager.client
   384      # TODO(gz): Having to manipulate client env state here to get the temp env
   385      #           is ugly, would ideally be captured in an explicit scope.
   386      update_env(client.env, bs_manager.temp_env_name, series=bs_manager.series,
   387                 agent_url=bs_manager.agent_url,
   388                 agent_stream=bs_manager.agent_stream, region=bs_manager.region)
   389      with bs_manager.top_context() as machines:
   390          with bs_manager.bootstrap_context(machines):
   391              client.bootstrap(args.upload_tools)
   392          with bs_manager.runtime_context(machines):
   393              yield
   394  
   395  
   396  def _get_container_types(client, machine_type):
   397      """
   398      Give list of container types to run testing against.
   399  
   400      If a machine_type was explicitly specified, only test against those kind
   401      of containers. Otherwise, test all possible containers for the given
   402      juju version.
   403      """
   404      if machine_type:
   405          if machine_type not in client.supported_container_types:
   406              raise Exception(
   407                  "no {} support on juju {}".format(machine_type,
   408                                                    client.version))
   409          return [machine_type]
   410      # TODO(gz): Only include LXC for 1.X clients
   411      types = list(client.supported_container_types)
   412      types.sort()
   413      return types
   414  
   415  
   416  def main(argv=None):
   417      args = parse_args(argv)
   418      configure_logging(args.verbose)
   419      bs_manager = BootstrapManager.from_args(args)
   420      client = bs_manager.client
   421      machine_types = _get_container_types(client, args.machine_type)
   422      with cleaned_bootstrap_context(bs_manager, args):
   423          assess_container_networking(bs_manager.client, machine_types, args.space)
   424      return 0
   425  
   426  
# When run as a script, use main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())