github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/acceptancetests/assess_network_health.py

github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/acceptancetests/assess_network_health.py (about)

     1  #!/usr/bin/env python
     2  """Assess network health for a given deployment or bundle"""
     3  from __future__ import print_function
     4  
     5  import argparse
     6  import logging
     7  import sys
     8  import json
     9  import yaml
    10  import subprocess
    11  import re
    12  import time
    13  import os
    14  import socket
    15  from collections import defaultdict
    16  
    17  from jujupy import (
    18      client_for_existing
    19      )
    20  from jujupy.wait_condition import (
    21      WaitApplicationNotPresent
    22      )
    23  from deploy_stack import (
    24      BootstrapManager
    25      )
    26  from utility import (
    27      add_basic_testing_arguments,
    28      generate_default_clean_dir,
    29      configure_logging,
    30      wait_for_port
    31      )
    32  from substrate import (
    33      maas_account_from_boot_config,
    34      )
    35  
# Under Python 2 this makes every class defined in this module new-style.
__metaclass__ = type

# Module-wide logger used by all of the helpers below.
log = logging.getLogger("assess_network_health")

# Fallback text referenced when logging the exposure test results.
NO_EXPOSED_UNITS = 'No exposed units'

# Port that is curled on each target by the visibility and expose checks.
PORT = 8039
    43  
    44  
    45  class AssessNetworkHealth:
    46  
    47      def __init__(self, args):
    48          if args.logs:
    49              self.log_dir = args.logs
    50          else:
    51              self.log_dir = generate_default_clean_dir(
    52                              args.temp_env_name)
    53          self.expose_client = None
    54          self.existing_series = set([])
    55          self.expose_test_charms = set([])
    56  
    57      def assess_network_health(self, client, bundle=None, target_model=None,
    58                                reboot=False, series=None, maas=None):
    59          """Assesses network health for a given deployment or bundle.
    60  
    61          :param client: The juju client in use
    62          :param bundle: Optional bundle to test on
    63          :param target_model: Optional existing model to test under
    64          :param reboot: Reboot and re-run tests
    65          :param series: Ubuntu series to deploy
    66          :param maas: MaaS manager object
    67          """
    68          if maas:
    69              setup_spaces(maas, bundle)
    70          self.setup_testing_environment(client, bundle, target_model, series)
    71          log.info('Starting network tests.')
    72          results_pre = self.testing_iterations(client, series, target_model)
    73          error_string = ['Initial test failures:']
    74          if not reboot:
    75              if results_pre:
    76                  error_string.extend(results_pre)
    77                  raise Exception('\n'.join(error_string))
    78              log.info('SUCCESS')
    79              return
    80          log.info('Units completed pre-reboot tests, rebooting machines.')
    81          self.reboot_machines(client)
    82          results_post = self.testing_iterations(client, series, target_model,
    83                                                 reboot_msg='Post-reboot ')
    84          if results_pre or results_post:
    85              error_string.extend(results_pre or 'No pre-reboot failures.')
    86              error_string.extend(['Post-reboot test failures:'])
    87              error_string.extend(results_post or 'No post-reboot failures.')
    88              raise Exception('\n'.join(error_string))
    89          log.info('SUCCESS')
    90          return
    91  
    92      def testing_iterations(self, client, series, target_model, reboot_msg=''):
    93          """Runs through each test given for a given client and series
    94  
    95          :param client: Client
    96          """
    97          interface_info = self.get_unit_info(client)
    98          log.info('{0}Interface information:\n{1}'.format(
    99              reboot_msg, json.dumps(interface_info, indent=4, sort_keys=True)))
   100          int_result = self.internet_connection(client)
   101          log.info('{0}Internet Test '
   102                   'result:\n {1}'.format(reboot_msg,
   103                                          json.dumps(int_result, indent=4,
   104                                                     sort_keys=True)))
   105          vis_result = self.neighbor_visibility(client)
   106          log.info('{0}Visibility '
   107                   'result:\n {1}'.format(reboot_msg,
   108                                          json.dumps(vis_result,
   109                                                     indent=4,
   110                                                     sort_keys=True)))
   111  
   112          exp_result = self.ensure_exposed(client, series)
   113          log.info('{0}Exposure '
   114                   'result:\n {1}'.format(reboot_msg,
   115                                          json.dumps(exp_result,
   116                                                     indent=4,
   117                                                     sort_keys=True)) or
   118                   NO_EXPOSED_UNITS)
   119          log.info('Tests complete.')
   120          return self.parse_final_results(vis_result, int_result,
   121                                          exp_result)
   122  
   123      def setup_testing_environment(self, client, bundle, target_model,
   124                                    series=None):
   125          """Sets up the testing environment given an option bundle and/or model.
   126  
   127          :param client: The juju client in use
   128          :param bundle: Optional bundle to test on or None
   129          :param model: Optional existing model to test under
   130          """
   131          log.info("Setting up test environment.")
   132          if target_model:
   133              self.connect_to_existing_model(client, target_model)
   134          if bundle:
   135              self.setup_bundle_deployment(client, bundle)
   136          elif bundle is None and target_model is None:
   137              self.setup_dummy_deployment(client, series)
   138          apps = client.get_status().get_applications()
   139          for _, info in apps.items():
   140              self.existing_series.add(info['series'])
   141          for series in self.existing_series:
   142              try:
   143                  client.deploy('~juju-qa/network-health', series=series,
   144                                alias='network-health-{}'.format(series))
   145  
   146              except subprocess.CalledProcessError:
   147                  log.info('Could not deploy network-health-{} as it is already'
   148                           ' present in the juju deployment.'.format(series))
   149          client.wait_for_started()
   150          client.wait_for_workloads()
   151          for series in self.existing_series:
   152              client.juju('expose', ('network-health-{}'.format(series)))
   153          apps = client.get_status().get_applications()
   154          log.info('Known applications: {}'.format(apps.keys()))
   155          for app, info in apps.items():
   156              app_series = info['series']
   157              try:
   158                  client.juju('add-relation',
   159                              (app, 'network-health-{}'.format(app_series)))
   160              except subprocess.CalledProcessError as e:
   161                  log.error('Could not relate {0} & network-health due '
   162                            'to error: {1}'.format(app, e))
   163          client.wait_for_workloads()
   164          for app, info in apps.items():
   165              app_series = info['series']
   166              client.wait_for_subordinate_units(
   167                  app, 'network-health-{}'.format(app_series))
   168  
   169      def connect_to_existing_model(self, client, target_model):
   170          """Connects to an existing Juju model.
   171  
   172          :param client: Juju client object without bootstrapped controller
   173          :param target_model: Model to connect to for testing
   174          """
   175          log.info("Connecting to existing model: {}".format(target_model))
   176          if client.show_model().keys()[0] is not target_model:
   177              client.switch(target_model)
   178  
   179      def setup_dummy_deployment(self, client, series):
   180          """Sets up a dummy test environment with 2 ubuntu charms.
   181  
   182          :param client: Bootstrapped juju client
   183          """
   184          log.info("Deploying dummy charm for basic testing.")
   185          client.deploy('ubuntu', num=2, series=series)
   186          client.juju('expose', ('ubuntu',))
   187          client.wait_for_started()
   188          client.wait_for_workloads()
   189  
   190      def setup_bundle_deployment(self, client, bundle):
   191          """Deploys a test environment with supplied bundle.
   192  
   193          :param bundle: Path to a bundle
   194          """
   195          log.info("Deploying bundle specified at {}".format(bundle))
   196          client.deploy_bundle(bundle)
   197          client.wait_for_started()
   198          client.wait_for_workloads()
   199  
   200      def cleanup(self, client):
   201          log.info('Cleaning up deployed test charms and models.')
   202          if self.expose_test_charms:
   203              for charm in self.expose_test_charms:
   204                  client.remove_service(charm)
   205              return
   206          for series in self.existing_series:
   207              client.remove_service('network-health-{}'.format(series))
   208  
   209      def get_unit_info(self, client):
   210          """Gets the machine or container interface info.
   211  
   212          :param client: Client to get results from
   213          :return: Dict of machine results as
   214          <machine>:{'interfaces':<interfaces>}
   215          """
   216          results = {}
   217          apps = client.get_status().get_applications()
   218          nh_units = self.get_nh_unit_info(apps, by_unit=True)
   219          for app, units in nh_units.items():
   220              machine = apps[app.split('/')[0]]['units'][app]['machine']
   221              results[machine] = defaultdict(defaultdict)
   222              results[machine]['interfaces'] = {}
   223              for nh_unit in units.keys():
   224                  out = client.action_do(nh_unit, 'unit-info')
   225                  out = client.action_fetch(out)
   226                  out = yaml.safe_load(out)
   227                  interfaces = out['results']['interfaces']
   228                  results[machine]['interfaces'][nh_unit] = interfaces
   229          return results
   230  
   231      def internet_connection(self, client):
   232          """Test that targets can ping their default route.
   233  
   234          :param client: Juju client
   235          :return: Dict of results by machine
   236          """
   237          log.info('Assessing internet connection.')
   238          results = {}
   239          units = client.get_status().iter_machines(containers=True)
   240          for unit in units:
   241              log.info("Assessing internet connection for "
   242                       "machine: {}".format(unit[0]))
   243              results[unit[0]] = False
   244              try:
   245                  routes = client.run(['ip route show'], machines=[unit[0]])
   246              except subprocess.CalledProcessError:
   247                  log.error('Could not connect to address for unit: {0}, '
   248                            'unable to find default route.'.format(unit[0]))
   249                  continue
   250              default_route = re.search(r'(default via )+([\d\.]+)\s+',
   251                                        json.dumps(routes[0]))
   252              if default_route:
   253                  results[unit[0]] = True
   254              else:
   255                  log.error("Default route not found for {}".format(unit[0]))
   256                  continue
   257          return results
   258  
   259      def get_nh_unit_info(self, apps, by_unit=False):
   260          """Parses juju status information to return deployed network-health units.
   261  
   262          :param apps: Dict of apps given by get_status().get_applications()
   263          :param by_unit: Bool, returns dict of NH units keyed by the unit they
   264          are subordinate to
   265          :return: Dict of network-health units
   266          """
   267          nh_units = {}
   268          nh_by_unit = {}
   269          for app, units in apps.items():
   270              for unit, info in units.get('units', {}).items():
   271                  nh_by_unit[unit] = {}
   272                  for sub, sub_info in info.get('subordinates', {}).items():
   273                      if 'network-health' in sub:
   274                          nh_by_unit[unit][sub] = sub_info
   275                          nh_units[sub] = sub_info
   276          if by_unit:
   277              return nh_by_unit
   278          return nh_units
   279  
   280      def neighbor_visibility(self, client):
   281          """Check if each application's units are visible, including our own.
   282  
   283          :param client: The juju client in use
   284          """
   285          log.info('Starting neighbor visibility test')
   286          apps = client.get_status().get_applications()
   287          nh_units = self.get_nh_unit_info(apps)
   288          target_ips = [ip['public-address'] for ip in nh_units.values()]
   289          result = {}
   290          for app, units in apps.items():
   291              result[app] = defaultdict(defaultdict)
   292              for unit, info in units.get('units', {}).items():
   293                  for ip in target_ips:
   294                      result[app][unit][ip] = False
   295                      pattern = r"(pass)"
   296                      log.info('Attempting to contact {}:{} '
   297                               'from {}'.format(ip, PORT, unit))
   298                      out = client.run(['curl {}:{}'.format(ip, PORT)],
   299                                       units=[unit])
   300                      match = re.search(pattern, json.dumps(out[0]))
   301                      if match:
   302                          log.info('pass')
   303                          result[app][unit][ip] = True
   304          return result
   305  
   306      def ensure_exposed(self, client, series):
   307          """Ensure exposed applications are visible from the outside.
   308  
   309          :param client: The juju client in use
   310          :return: Exposure test results in dict by pass/fail
   311          """
   312          log.info('Starting test of exposed units.')
   313  
   314          apps = client.get_status().get_applications()
   315          exposed = [app for app, e in apps.items() if e.get('exposed')
   316                     is True and 'network-health' not in app]
   317          if len(exposed) is 0:
   318              nh_only = True
   319              log.info('No exposed units, testing with network-health '
   320                       'charms only.')
   321              for series in self.existing_series:
   322                  exposed.append('network-health-{}'.format(series))
   323          else:
   324              nh_only = False
   325              self.setup_expose_test(client, series, exposed)
   326  
   327          service_results = {}
   328          for unit, info in client.get_status().iter_units():
   329              ip = info['public-address']
   330              if nh_only and 'network-health' in unit:
   331                  service_results[unit] = self.curl(ip)
   332              elif not nh_only and 'network-health' not in unit:
   333                  service_results[unit] = self.curl(ip)
   334          log.info(service_results)
   335          return self.parse_expose_results(service_results, exposed)
   336  
   337      def curl(self, ip):
   338          log.info('Attempting to curl unit at {}:{}'.format(ip, PORT))
   339          try:
   340              out = subprocess.check_output(
   341                  'curl {}:{} -m 5'.format(ip, PORT), shell=True)
   342          except subprocess.CalledProcessError as e:
   343              out = ''
   344              log.warning('Curl failed for error:\n{}'.format(e))
   345          log.info('Got: "{}" from unit at {}:{}'.format(out, ip, PORT))
   346          if 'pass' in out:
   347              return True
   348          return False
   349  
   350      def setup_expose_test(self, client, series, exposed):
   351          """Sets up the expose test using aliased NH charms.
   352  
   353          :param client: juju client object used in the test.
   354          :param series: Charm series
   355          :param exposed: List of exposed charms
   356          """
   357  
   358          log.info('Removing previous network-health charms')
   359  
   360          """
   361          This is done to work with the behavior used in other network-health
   362          tests to circumvent Juju's lack of support for multi-series charms.
   363          If a multi-series subordinate is deployed under one of its available
   364          series, then a second copy of that charm in a different series cannot
   365          also be deployed. Subsequently, when we deploy the NH charms for the
   366          above tests, the series is appended to the end of the charm. In order
   367          for the expose test to work properly the NH charm has to be exposed,
   368          which in Juju means all of the NH charms under that alias or none.
   369          So if there are existing exposed units, the test redeploys an aliased
   370          NH charm under each so that it can expose them individually, ensuring
   371          valid test results.
   372          On the subject of speed, since the deps in network-health's wheelhouse
   373          have already been built on the target machine or container, this is a
   374          relatively fast process at ~30 seconds for large(6+ charm) deployments.
   375          """
   376          for series in self.existing_series:
   377              alias = 'network-health-{}'.format(series)
   378              client.remove_service(alias)
   379          for series in self.existing_series:
   380              alias = 'network-health-{}'.format(series)
   381              client.wait_for(WaitApplicationNotPresent(alias))
   382          log.info('Deploying aliased network-health charms')
   383          apps = client.get_status().get_applications()
   384          for app, info in apps.items():
   385              if 'network-health' not in app:
   386                  alias = 'network-health-{}'.format(app)
   387                  client.deploy('~juju-qa/network-health', alias=alias,
   388                                series=info['series'])
   389                  try:
   390                      client.juju('add-relation', (app, alias))
   391                      self.expose_test_charms.add(alias)
   392                  except subprocess.CalledProcessError as e:
   393                      log.warning('Could not relate {}, {} due to '
   394                                  'error:\n{}'.format(app, alias, e))
   395          for app in apps.keys():
   396              if 'network-health' not in app:
   397                  client.wait_for_subordinate_units(
   398                      app, 'network-health-{}'.format(app))
   399          for app in exposed:
   400              client.juju('expose', ('network-health-{}'.format(app)))
   401  
   402      def parse_expose_results(self, service_results, exposed):
   403          """Parses expose test results into dict of pass/fail.
   404  
   405          :param service_results: Raw results from expose test
   406          :return: Parsed results dict
   407          """
   408          results = {'fail': (),
   409                     'pass': ()}
   410          for unit, result in service_results.items():
   411              app = unit.split('/')[0]
   412              if app in exposed and result:
   413                  results['pass'] += (unit,)
   414              elif app in exposed and not result:
   415                  results['fail'] += (unit,)
   416              elif app not in exposed and result:
   417                  results['fail'] += (unit,)
   418              else:
   419                  results['pass'] += (unit,)
   420          return results
   421  
   422      def parse_final_results(self, visibility, internet, exposed):
   423          """Parses test results and raises an error if any failed.
   424  
   425          :param visibility: Visibility test result
   426          :param exposed: Exposure test result
   427          """
   428          log.info('Parsing final results.')
   429          error_string = []
   430          for nh_source, service_result in visibility.items():
   431                  for service, unit_res in service_result.items():
   432                      if False in unit_res.values():
   433                          failed = [u for u, r in unit_res.items() if r is False]
   434                          error = ('Unit {0} failed to contact '
   435                                   'targets(s): {1}'.format(nh_source, failed))
   436                          error_string.append(error)
   437          for unit, res in internet.items():
   438              if not res:
   439                  error = 'Machine {} failed internet connection.'.format(unit)
   440                  error_string.append(error)
   441          if exposed and exposed['fail'] is not ():
   442              error = ('Application(s) {0} failed expose '
   443                       'test'.format(exposed['fail']))
   444              error_string.append(error)
   445          return error_string
   446  
   447      def reboot_machines(self, client):
   448          log.info("Starting reboot of all containers.")
   449          try:
   450              for machine, m_info in client.get_status().iter_machines():
   451                  cont_ids = []
   452                  try:
   453                      cont_ids.extend([c['instance-id'] for c in
   454                                      m_info.get('containers').values()])
   455                  except KeyError:
   456                      log.info('No containers for machine: {}'.format(machine))
   457                  if cont_ids:
   458                      log.info('Restarting containers: {0} on '
   459                               'machine: {1}'.format(cont_ids, machine))
   460                      self.ssh(client, machine,
   461                               'sudo lxc restart {}'.format(' '.join(cont_ids)))
   462                  log.info("Restarting machine: {}".format(machine))
   463                  client.juju('run', ('--machine', machine,
   464                                      'sudo shutdown -r now'))
   465                  hostname = client.get_status().get_machine_dns_name(machine)
   466                  wait_for_port(hostname, 22, timeout=240)
   467  
   468          except subprocess.CalledProcessError as e:
   469              logging.info(
   470                  "Error running shutdown:\nstdout: {}\nstderr: {}".format(
   471                      e.output, getattr(e, 'stderr', None)))
   472          client.wait_for_started()
   473  
   474      def ssh(self, client, machine, cmd):
   475          """Convenience function: run a juju ssh command and get back the output
   476          :param client: A Juju client
   477          :param machine: ID of the machine on which to run a command
   478          :param cmd: the command to run
   479          :return: text output of the command
   480          """
   481          back_off = 2
   482          attempts = 4
   483          for attempt in range(attempts):
   484              try:
   485                  return client.get_juju_output('ssh', '--proxy', machine,
   486                                                cmd)
   487              except subprocess.CalledProcessError as e:
   488                  # If the connection to the host failed, try again in a couple
   489                  # of seconds. This is usually due to heavy load.
   490                  if(attempt < attempts - 1 and
   491                      re.search('ssh_exchange_identification: '
   492                                'Connection closed by remote host', e.stderr)):
   493                      time.sleep(back_off)
   494                      back_off *= 2
   495                  else:
   496                      raise
   497  
   498      def is_ipv6(self, address):
   499          try:
   500              socket.inet_pton(socket.AF_INET6, address)
   501          except socket.error:
   502              return False
   503          return True
   504  
   505      def to_json(self, units):
   506          """Returns a formatted json string to be passed through juju run-action.
   507  
   508          :param units: Dict of units
   509          :return: A "JSON-like" string that can be passed to Juju without it
   510          puking
   511          """
   512          json_string = json.dumps(units, separators=(',', '='))
   513          # Replace curly brackets so juju doesn't think it's JSON and puke
   514          json_string = json_string.replace('{', '(')
   515          json_string = json_string.replace('}', ')')
   516          return json_string
   517  
   518  
   519  def setup_spaces(maas, bundle=None):
   520      """Setup MaaS spaces to test charm bindings.
   521  
   522      Reads from the bundle file and pulls out the required spaces,
   523      then adds those spaces to the MaaS cluster using our MaaS
   524      controller wrapper.
   525  
   526      :param maas: MaaS manager object
   527      :param bundle: Bundle supplied in test
   528      """
   529      if not bundle:
   530          log.info('No bundle specified, skipping MaaS space assurance')
   531          return
   532      with open(bundle) as f:
   533          data = f.read()
   534          bundle_yaml = yaml.load(data)
   535      existing_spaces = maas.spaces()
   536      new_spaces = _setup_spaces(bundle_yaml, existing_spaces)
   537      for space in new_spaces:
   538          maas.create_space(space)
   539          log.info("Created space: {}".format(space))
   540  
   541  
   542  def _setup_spaces(bundle, existing_spaces):
   543      log.info("Have spaces: {}".format(
   544          ", ".join(s["name"] for s in existing_spaces)))
   545      spaces_map = dict((s["name"], s) for s in existing_spaces)
   546      required_spaces = {}
   547      log.info('Getting spaces from bundle: {}'.format(bundle))
   548  
   549      for info in bundle['services'].values():
   550          for binding, space in info.get('bindings').items():
   551              required_spaces[binding] = space
   552      new_spaces = []
   553      for space_name in required_spaces.values():
   554          space = spaces_map.get(space_name)
   555          if not space:
   556              new_spaces.append(space_name)
   557      return new_spaces
   558  
   559  
   560  def parse_args(argv):
   561      """Parse all arguments."""
   562      parser = argparse.ArgumentParser(description="Test Network Health")
   563      add_basic_testing_arguments(parser, existing=False)
   564      parser.add_argument('--bundle', help='Bundle to test network against')
   565      parser.add_argument('--model', help='Existing Juju model to test against')
   566      parser.add_argument('--reboot', type=bool,
   567                          help='Reboot machines and re-run tests, default=False')
   568      parser.add_argument('--maas', type=bool,
   569                          help='Test under maas')
   570      parser.set_defaults(maas=False)
   571      parser.set_defaults(reboot=False)
   572      parser.set_defaults(series='bionic')
   573      return parser.parse_args(argv)
   574  
   575  
   576  def start_test(client, args, maas):
   577      test = AssessNetworkHealth(args)
   578      try:
   579          test.assess_network_health(client, args.bundle, args.model,
   580                                     args.reboot, args.series, maas)
   581      finally:
   582          if args.model:
   583              test.cleanup(client)
   584              log.info('Cleanup complete.')
   585  
   586  
   587  def start_maas_test(client, args):
   588      try:
   589          with maas_account_from_boot_config(client.env) as manager:
   590              start_test(client, args, manager)
   591      except subprocess.CalledProcessError as e:
   592          log.warning(
   593              'Could not connect to MaaS controller due to error:\n{}'.format(e))
   594          log.warning('Attempting test without ensuring MaaS spaces.')
   595          start_test(client, args, None)
   596  
   597  
   598  def main(argv=None):
   599      args = parse_args(argv)
   600      configure_logging(args.verbose)
   601      if args.model:
   602          client = client_for_existing(args.juju_bin,
   603                                       os.environ['JUJU_HOME'])
   604          start_test(client, args, None)
   605      else:
   606          bs_manager = BootstrapManager.from_args(args)
   607          if args.maas:
   608              bs_manager.client.excluded_spaces = set()
   609              bs_manager.client.reserved_spaces = set()
   610          with bs_manager.booted_context(args.upload_tools):
   611              if args.maas:
   612                  start_maas_test(bs_manager.client, args)
   613              else:
   614                  start_test(bs_manager.client, args, None)
   615      return 0
   616  
   617  
# Run the assessment when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())