github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/acceptancetests/assess_network_health.py (about) 1 #!/usr/bin/env python 2 """Assess network health for a given deployment or bundle""" 3 from __future__ import print_function 4 5 import argparse 6 import logging 7 import sys 8 import json 9 import yaml 10 import subprocess 11 import re 12 import time 13 import os 14 import socket 15 from collections import defaultdict 16 17 from jujupy import ( 18 client_for_existing 19 ) 20 from jujupy.wait_condition import ( 21 WaitApplicationNotPresent 22 ) 23 from deploy_stack import ( 24 BootstrapManager 25 ) 26 from utility import ( 27 add_basic_testing_arguments, 28 generate_default_clean_dir, 29 configure_logging, 30 wait_for_port 31 ) 32 from substrate import ( 33 maas_account_from_boot_config, 34 ) 35 36 __metaclass__ = type 37 38 log = logging.getLogger("assess_network_health") 39 40 NO_EXPOSED_UNITS = 'No exposed units' 41 42 PORT = 8039 43 44 45 class AssessNetworkHealth: 46 47 def __init__(self, args): 48 if args.logs: 49 self.log_dir = args.logs 50 else: 51 self.log_dir = generate_default_clean_dir( 52 args.temp_env_name) 53 self.expose_client = None 54 self.existing_series = set([]) 55 self.expose_test_charms = set([]) 56 57 def assess_network_health(self, client, bundle=None, target_model=None, 58 reboot=False, series=None, maas=None): 59 """Assesses network health for a given deployment or bundle. 60 61 :param client: The juju client in use 62 :param bundle: Optional bundle to test on 63 :param target_model: Optional existing model to test under 64 :param reboot: Reboot and re-run tests 65 :param series: Ubuntu series to deploy 66 :param maas: MaaS manager object 67 """ 68 if maas: 69 setup_spaces(maas, bundle) 70 self.setup_testing_environment(client, bundle, target_model, series) 71 log.info('Starting network tests.') 72 results_pre = self.testing_iterations(client, series, target_model) 73 error_string = ['Initial test failures:'] 74 if not reboot: 75 if results_pre: 76 error_string.extend(results_pre) 77 raise Exception('\n'.join(error_string)) 78 log.info('SUCCESS') 79 return 80 log.info('Units completed pre-reboot tests, rebooting machines.') 81 self.reboot_machines(client) 82 results_post = self.testing_iterations(client, series, target_model, 83 reboot_msg='Post-reboot ') 84 if results_pre or results_post: 85 error_string.extend(results_pre or 'No pre-reboot failures.') 86 error_string.extend(['Post-reboot test failures:']) 87 error_string.extend(results_post or 'No post-reboot failures.') 88 raise Exception('\n'.join(error_string)) 89 log.info('SUCCESS') 90 return 91 92 def testing_iterations(self, client, series, target_model, reboot_msg=''): 93 """Runs through each test given for a given client and series 94 95 :param client: Client 96 """ 97 interface_info = self.get_unit_info(client) 98 log.info('{0}Interface information:\n{1}'.format( 99 reboot_msg, json.dumps(interface_info, indent=4, sort_keys=True))) 100 int_result = self.internet_connection(client) 101 log.info('{0}Internet Test ' 102 'result:\n {1}'.format(reboot_msg, 103 json.dumps(int_result, indent=4, 104 sort_keys=True))) 105 vis_result = self.neighbor_visibility(client) 106 log.info('{0}Visibility ' 107 'result:\n {1}'.format(reboot_msg, 108 json.dumps(vis_result, 109 indent=4, 110 sort_keys=True))) 111 112 exp_result = self.ensure_exposed(client, series) 113 log.info('{0}Exposure ' 114 'result:\n {1}'.format(reboot_msg, 115 json.dumps(exp_result, 116 indent=4, 117 sort_keys=True)) or 118 NO_EXPOSED_UNITS) 119 log.info('Tests complete.') 120 return self.parse_final_results(vis_result, int_result, 121 exp_result) 122 123 def setup_testing_environment(self, client, bundle, target_model, 124 series=None): 125 """Sets up the testing environment given an option bundle and/or model. 126 127 :param client: The juju client in use 128 :param bundle: Optional bundle to test on or None 129 :param model: Optional existing model to test under 130 """ 131 log.info("Setting up test environment.") 132 if target_model: 133 self.connect_to_existing_model(client, target_model) 134 if bundle: 135 self.setup_bundle_deployment(client, bundle) 136 elif bundle is None and target_model is None: 137 self.setup_dummy_deployment(client, series) 138 apps = client.get_status().get_applications() 139 for _, info in apps.items(): 140 self.existing_series.add(info['series']) 141 for series in self.existing_series: 142 try: 143 client.deploy('~juju-qa/network-health', series=series, 144 alias='network-health-{}'.format(series)) 145 146 except subprocess.CalledProcessError: 147 log.info('Could not deploy network-health-{} as it is already' 148 ' present in the juju deployment.'.format(series)) 149 client.wait_for_started() 150 client.wait_for_workloads() 151 for series in self.existing_series: 152 client.juju('expose', ('network-health-{}'.format(series))) 153 apps = client.get_status().get_applications() 154 log.info('Known applications: {}'.format(apps.keys())) 155 for app, info in apps.items(): 156 app_series = info['series'] 157 try: 158 client.juju('add-relation', 159 (app, 'network-health-{}'.format(app_series))) 160 except subprocess.CalledProcessError as e: 161 log.error('Could not relate {0} & network-health due ' 162 'to error: {1}'.format(app, e)) 163 client.wait_for_workloads() 164 for app, info in apps.items(): 165 app_series = info['series'] 166 client.wait_for_subordinate_units( 167 app, 'network-health-{}'.format(app_series)) 168 169 def connect_to_existing_model(self, client, target_model): 170 """Connects to an existing Juju model. 171 172 :param client: Juju client object without bootstrapped controller 173 :param target_model: Model to connect to for testing 174 """ 175 log.info("Connecting to existing model: {}".format(target_model)) 176 if client.show_model().keys()[0] is not target_model: 177 client.switch(target_model) 178 179 def setup_dummy_deployment(self, client, series): 180 """Sets up a dummy test environment with 2 ubuntu charms. 181 182 :param client: Bootstrapped juju client 183 """ 184 log.info("Deploying dummy charm for basic testing.") 185 client.deploy('ubuntu', num=2, series=series) 186 client.juju('expose', ('ubuntu',)) 187 client.wait_for_started() 188 client.wait_for_workloads() 189 190 def setup_bundle_deployment(self, client, bundle): 191 """Deploys a test environment with supplied bundle. 192 193 :param bundle: Path to a bundle 194 """ 195 log.info("Deploying bundle specified at {}".format(bundle)) 196 client.deploy_bundle(bundle) 197 client.wait_for_started() 198 client.wait_for_workloads() 199 200 def cleanup(self, client): 201 log.info('Cleaning up deployed test charms and models.') 202 if self.expose_test_charms: 203 for charm in self.expose_test_charms: 204 client.remove_service(charm) 205 return 206 for series in self.existing_series: 207 client.remove_service('network-health-{}'.format(series)) 208 209 def get_unit_info(self, client): 210 """Gets the machine or container interface info. 211 212 :param client: Client to get results from 213 :return: Dict of machine results as 214 <machine>:{'interfaces':<interfaces>} 215 """ 216 results = {} 217 apps = client.get_status().get_applications() 218 nh_units = self.get_nh_unit_info(apps, by_unit=True) 219 for app, units in nh_units.items(): 220 machine = apps[app.split('/')[0]]['units'][app]['machine'] 221 results[machine] = defaultdict(defaultdict) 222 results[machine]['interfaces'] = {} 223 for nh_unit in units.keys(): 224 out = client.action_do(nh_unit, 'unit-info') 225 out = client.action_fetch(out) 226 out = yaml.safe_load(out) 227 interfaces = out['results']['interfaces'] 228 results[machine]['interfaces'][nh_unit] = interfaces 229 return results 230 231 def internet_connection(self, client): 232 """Test that targets can ping their default route. 233 234 :param client: Juju client 235 :return: Dict of results by machine 236 """ 237 log.info('Assessing internet connection.') 238 results = {} 239 units = client.get_status().iter_machines(containers=True) 240 for unit in units: 241 log.info("Assessing internet connection for " 242 "machine: {}".format(unit[0])) 243 results[unit[0]] = False 244 try: 245 routes = client.run(['ip route show'], machines=[unit[0]]) 246 except subprocess.CalledProcessError: 247 log.error('Could not connect to address for unit: {0}, ' 248 'unable to find default route.'.format(unit[0])) 249 continue 250 default_route = re.search(r'(default via )+([\d\.]+)\s+', 251 json.dumps(routes[0])) 252 if default_route: 253 results[unit[0]] = True 254 else: 255 log.error("Default route not found for {}".format(unit[0])) 256 continue 257 return results 258 259 def get_nh_unit_info(self, apps, by_unit=False): 260 """Parses juju status information to return deployed network-health units. 261 262 :param apps: Dict of apps given by get_status().get_applications() 263 :param by_unit: Bool, returns dict of NH units keyed by the unit they 264 are subordinate to 265 :return: Dict of network-health units 266 """ 267 nh_units = {} 268 nh_by_unit = {} 269 for app, units in apps.items(): 270 for unit, info in units.get('units', {}).items(): 271 nh_by_unit[unit] = {} 272 for sub, sub_info in info.get('subordinates', {}).items(): 273 if 'network-health' in sub: 274 nh_by_unit[unit][sub] = sub_info 275 nh_units[sub] = sub_info 276 if by_unit: 277 return nh_by_unit 278 return nh_units 279 280 def neighbor_visibility(self, client): 281 """Check if each application's units are visible, including our own. 282 283 :param client: The juju client in use 284 """ 285 log.info('Starting neighbor visibility test') 286 apps = client.get_status().get_applications() 287 nh_units = self.get_nh_unit_info(apps) 288 target_ips = [ip['public-address'] for ip in nh_units.values()] 289 result = {} 290 for app, units in apps.items(): 291 result[app] = defaultdict(defaultdict) 292 for unit, info in units.get('units', {}).items(): 293 for ip in target_ips: 294 result[app][unit][ip] = False 295 pattern = r"(pass)" 296 log.info('Attempting to contact {}:{} ' 297 'from {}'.format(ip, PORT, unit)) 298 out = client.run(['curl {}:{}'.format(ip, PORT)], 299 units=[unit]) 300 match = re.search(pattern, json.dumps(out[0])) 301 if match: 302 log.info('pass') 303 result[app][unit][ip] = True 304 return result 305 306 def ensure_exposed(self, client, series): 307 """Ensure exposed applications are visible from the outside. 308 309 :param client: The juju client in use 310 :return: Exposure test results in dict by pass/fail 311 """ 312 log.info('Starting test of exposed units.') 313 314 apps = client.get_status().get_applications() 315 exposed = [app for app, e in apps.items() if e.get('exposed') 316 is True and 'network-health' not in app] 317 if len(exposed) is 0: 318 nh_only = True 319 log.info('No exposed units, testing with network-health ' 320 'charms only.') 321 for series in self.existing_series: 322 exposed.append('network-health-{}'.format(series)) 323 else: 324 nh_only = False 325 self.setup_expose_test(client, series, exposed) 326 327 service_results = {} 328 for unit, info in client.get_status().iter_units(): 329 ip = info['public-address'] 330 if nh_only and 'network-health' in unit: 331 service_results[unit] = self.curl(ip) 332 elif not nh_only and 'network-health' not in unit: 333 service_results[unit] = self.curl(ip) 334 log.info(service_results) 335 return self.parse_expose_results(service_results, exposed) 336 337 def curl(self, ip): 338 log.info('Attempting to curl unit at {}:{}'.format(ip, PORT)) 339 try: 340 out = subprocess.check_output( 341 'curl {}:{} -m 5'.format(ip, PORT), shell=True) 342 except subprocess.CalledProcessError as e: 343 out = '' 344 log.warning('Curl failed for error:\n{}'.format(e)) 345 log.info('Got: "{}" from unit at {}:{}'.format(out, ip, PORT)) 346 if 'pass' in out: 347 return True 348 return False 349 350 def setup_expose_test(self, client, series, exposed): 351 """Sets up the expose test using aliased NH charms. 352 353 :param client: juju client object used in the test. 354 :param series: Charm series 355 :param exposed: List of exposed charms 356 """ 357 358 log.info('Removing previous network-health charms') 359 360 """ 361 This is done to work with the behavior used in other network-health 362 tests to circumvent Juju's lack of support for multi-series charms. 363 If a multi-series subordinate is deployed under one of its available 364 series, then a second copy of that charm in a different series cannot 365 also be deployed. Subsequently, when we deploy the NH charms for the 366 above tests, the series is appended to the end of the charm. In order 367 for the expose test to work properly the NH charm has to be exposed, 368 which in Juju means all of the NH charms under that alias or none. 369 So if there are existing exposed units, the test redeploys an aliased 370 NH charm under each so that it can expose them individually, ensuring 371 valid test results. 372 On the subject of speed, since the deps in network-health's wheelhouse 373 have already been built on the target machine or container, this is a 374 relatively fast process at ~30 seconds for large(6+ charm) deployments. 375 """ 376 for series in self.existing_series: 377 alias = 'network-health-{}'.format(series) 378 client.remove_service(alias) 379 for series in self.existing_series: 380 alias = 'network-health-{}'.format(series) 381 client.wait_for(WaitApplicationNotPresent(alias)) 382 log.info('Deploying aliased network-health charms') 383 apps = client.get_status().get_applications() 384 for app, info in apps.items(): 385 if 'network-health' not in app: 386 alias = 'network-health-{}'.format(app) 387 client.deploy('~juju-qa/network-health', alias=alias, 388 series=info['series']) 389 try: 390 client.juju('add-relation', (app, alias)) 391 self.expose_test_charms.add(alias) 392 except subprocess.CalledProcessError as e: 393 log.warning('Could not relate {}, {} due to ' 394 'error:\n{}'.format(app, alias, e)) 395 for app in apps.keys(): 396 if 'network-health' not in app: 397 client.wait_for_subordinate_units( 398 app, 'network-health-{}'.format(app)) 399 for app in exposed: 400 client.juju('expose', ('network-health-{}'.format(app))) 401 402 def parse_expose_results(self, service_results, exposed): 403 """Parses expose test results into dict of pass/fail. 404 405 :param service_results: Raw results from expose test 406 :return: Parsed results dict 407 """ 408 results = {'fail': (), 409 'pass': ()} 410 for unit, result in service_results.items(): 411 app = unit.split('/')[0] 412 if app in exposed and result: 413 results['pass'] += (unit,) 414 elif app in exposed and not result: 415 results['fail'] += (unit,) 416 elif app not in exposed and result: 417 results['fail'] += (unit,) 418 else: 419 results['pass'] += (unit,) 420 return results 421 422 def parse_final_results(self, visibility, internet, exposed): 423 """Parses test results and raises an error if any failed. 424 425 :param visibility: Visibility test result 426 :param exposed: Exposure test result 427 """ 428 log.info('Parsing final results.') 429 error_string = [] 430 for nh_source, service_result in visibility.items(): 431 for service, unit_res in service_result.items(): 432 if False in unit_res.values(): 433 failed = [u for u, r in unit_res.items() if r is False] 434 error = ('Unit {0} failed to contact ' 435 'targets(s): {1}'.format(nh_source, failed)) 436 error_string.append(error) 437 for unit, res in internet.items(): 438 if not res: 439 error = 'Machine {} failed internet connection.'.format(unit) 440 error_string.append(error) 441 if exposed and exposed['fail'] is not (): 442 error = ('Application(s) {0} failed expose ' 443 'test'.format(exposed['fail'])) 444 error_string.append(error) 445 return error_string 446 447 def reboot_machines(self, client): 448 log.info("Starting reboot of all containers.") 449 try: 450 for machine, m_info in client.get_status().iter_machines(): 451 cont_ids = [] 452 try: 453 cont_ids.extend([c['instance-id'] for c in 454 m_info.get('containers').values()]) 455 except KeyError: 456 log.info('No containers for machine: {}'.format(machine)) 457 if cont_ids: 458 log.info('Restarting containers: {0} on ' 459 'machine: {1}'.format(cont_ids, machine)) 460 self.ssh(client, machine, 461 'sudo lxc restart {}'.format(' '.join(cont_ids))) 462 log.info("Restarting machine: {}".format(machine)) 463 client.juju('run', ('--machine', machine, 464 'sudo shutdown -r now')) 465 hostname = client.get_status().get_machine_dns_name(machine) 466 wait_for_port(hostname, 22, timeout=240) 467 468 except subprocess.CalledProcessError as e: 469 logging.info( 470 "Error running shutdown:\nstdout: {}\nstderr: {}".format( 471 e.output, getattr(e, 'stderr', None))) 472 client.wait_for_started() 473 474 def ssh(self, client, machine, cmd): 475 """Convenience function: run a juju ssh command and get back the output 476 :param client: A Juju client 477 :param machine: ID of the machine on which to run a command 478 :param cmd: the command to run 479 :return: text output of the command 480 """ 481 back_off = 2 482 attempts = 4 483 for attempt in range(attempts): 484 try: 485 return client.get_juju_output('ssh', '--proxy', machine, 486 cmd) 487 except subprocess.CalledProcessError as e: 488 # If the connection to the host failed, try again in a couple 489 # of seconds. This is usually due to heavy load. 490 if(attempt < attempts - 1 and 491 re.search('ssh_exchange_identification: ' 492 'Connection closed by remote host', e.stderr)): 493 time.sleep(back_off) 494 back_off *= 2 495 else: 496 raise 497 498 def is_ipv6(self, address): 499 try: 500 socket.inet_pton(socket.AF_INET6, address) 501 except socket.error: 502 return False 503 return True 504 505 def to_json(self, units): 506 """Returns a formatted json string to be passed through juju run-action. 507 508 :param units: Dict of units 509 :return: A "JSON-like" string that can be passed to Juju without it 510 puking 511 """ 512 json_string = json.dumps(units, separators=(',', '=')) 513 # Replace curly brackets so juju doesn't think it's JSON and puke 514 json_string = json_string.replace('{', '(') 515 json_string = json_string.replace('}', ')') 516 return json_string 517 518 519 def setup_spaces(maas, bundle=None): 520 """Setup MaaS spaces to test charm bindings. 521 522 Reads from the bundle file and pulls out the required spaces, 523 then adds those spaces to the MaaS cluster using our MaaS 524 controller wrapper. 525 526 :param maas: MaaS manager object 527 :param bundle: Bundle supplied in test 528 """ 529 if not bundle: 530 log.info('No bundle specified, skipping MaaS space assurance') 531 return 532 with open(bundle) as f: 533 data = f.read() 534 bundle_yaml = yaml.load(data) 535 existing_spaces = maas.spaces() 536 new_spaces = _setup_spaces(bundle_yaml, existing_spaces) 537 for space in new_spaces: 538 maas.create_space(space) 539 log.info("Created space: {}".format(space)) 540 541 542 def _setup_spaces(bundle, existing_spaces): 543 log.info("Have spaces: {}".format( 544 ", ".join(s["name"] for s in existing_spaces))) 545 spaces_map = dict((s["name"], s) for s in existing_spaces) 546 required_spaces = {} 547 log.info('Getting spaces from bundle: {}'.format(bundle)) 548 549 for info in bundle['services'].values(): 550 for binding, space in info.get('bindings').items(): 551 required_spaces[binding] = space 552 new_spaces = [] 553 for space_name in required_spaces.values(): 554 space = spaces_map.get(space_name) 555 if not space: 556 new_spaces.append(space_name) 557 return new_spaces 558 559 560 def parse_args(argv): 561 """Parse all arguments.""" 562 parser = argparse.ArgumentParser(description="Test Network Health") 563 add_basic_testing_arguments(parser, existing=False) 564 parser.add_argument('--bundle', help='Bundle to test network against') 565 parser.add_argument('--model', help='Existing Juju model to test against') 566 parser.add_argument('--reboot', type=bool, 567 help='Reboot machines and re-run tests, default=False') 568 parser.add_argument('--maas', type=bool, 569 help='Test under maas') 570 parser.set_defaults(maas=False) 571 parser.set_defaults(reboot=False) 572 parser.set_defaults(series='bionic') 573 return parser.parse_args(argv) 574 575 576 def start_test(client, args, maas): 577 test = AssessNetworkHealth(args) 578 try: 579 test.assess_network_health(client, args.bundle, args.model, 580 args.reboot, args.series, maas) 581 finally: 582 if args.model: 583 test.cleanup(client) 584 log.info('Cleanup complete.') 585 586 587 def start_maas_test(client, args): 588 try: 589 with maas_account_from_boot_config(client.env) as manager: 590 start_test(client, args, manager) 591 except subprocess.CalledProcessError as e: 592 log.warning( 593 'Could not connect to MaaS controller due to error:\n{}'.format(e)) 594 log.warning('Attempting test without ensuring MaaS spaces.') 595 start_test(client, args, None) 596 597 598 def main(argv=None): 599 args = parse_args(argv) 600 configure_logging(args.verbose) 601 if args.model: 602 client = client_for_existing(args.juju_bin, 603 os.environ['JUJU_HOME']) 604 start_test(client, args, None) 605 else: 606 bs_manager = BootstrapManager.from_args(args) 607 if args.maas: 608 bs_manager.client.excluded_spaces = set() 609 bs_manager.client.reserved_spaces = set() 610 with bs_manager.booted_context(args.upload_tools): 611 if args.maas: 612 start_maas_test(bs_manager.client, args) 613 else: 614 start_test(bs_manager.client, args, None) 615 return 0 616 617 618 if __name__ == '__main__': 619 sys.exit(main())