# Origin: github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/utils.py
import logging
import time

from kubeflow.training import TrainingClient
from kubeflow.training.constants import constants
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_SCHEDULER_PLUGINS
from test.e2e.constants import DEFAULT_SCHEDULER_PLUGINS_NAME
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_VOLCANO

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)


def verify_unschedulable_job_e2e(
    client: TrainingClient, name: str, namespace: str, job_kind: str
) -> None:
    """Verify unschedulable Training Job e2e test.

    Waits for the Job to reach the Created condition, then checks several
    times that it is still Created and has not become Running.

    Args:
        client: Training client used to query the Job.
        name: Name of the Job.
        namespace: Namespace of the Job.
        job_kind: Kind of the Job; also used in log and error messages.

    Raises:
        Exception: If the Job is not in Created condition, or is in Running
            condition, during any of the checks.
    """
    attempts = 3
    pause_seconds = 5

    logging.info(f"\n\n\n{job_kind} is creating")
    client.wait_for_job_conditions(
        name, namespace, job_kind, {constants.JOB_CONDITION_CREATED}
    )

    logging.info(f"Checking {attempts} times that pods are not scheduled")
    for attempt in range(1, attempts + 1):
        logging.info(f"Number of attempts: {attempt}/{attempts}")

        # Job should have a Created condition.
        if not client.is_job_created(name, namespace, job_kind):
            raise Exception(f"{job_kind} should be in Created condition")

        # Job shouldn't have a Running condition.
        if client.is_job_running(name, namespace, job_kind):
            raise Exception(f"{job_kind} shouldn't be in Running condition")

        # Only pause between checks: nothing is inspected after the last
        # one, so sleeping there would just lengthen the test run.
        if attempt < attempts:
            logging.info(f"Sleeping {pause_seconds} seconds...")
            time.sleep(pause_seconds)


def verify_job_e2e(
    client: TrainingClient,
    name: str,
    namespace: str,
    job_kind: str,
    container: str,
    timeout: int = 600,
) -> None:
    """Verify Training Job e2e test.

    Waits for the Job to finish, validates its conditions, then logs the
    Job's pod names and requests its logs.

    Args:
        client: Training client used to query the Job.
        name: Name of the Job.
        namespace: Namespace of the Job.
        job_kind: Kind of the Job; also used in log and error messages.
        container: Container name passed to ``client.get_job_logs``.
        timeout: Forwarded as the ``timeout`` keyword of
            ``client.wait_for_job_conditions``.

    Raises:
        Exception: If the Job does not have exactly three conditions, or if
            any individual condition check fails.
    """
    # Wait until Job is Succeeded.
    # NOTE(review): confirm that the `timeout` keyword is the overall wait
    # budget in this SDK version and not a per-request API timeout.
    logging.info(f"\n\n\n{job_kind} is running")
    client.wait_for_job_conditions(name, namespace, job_kind, timeout=timeout)

    # Job should have Created, Running, and Succeeded conditions.
    conditions = client.get_job_conditions(name, namespace, job_kind)
    if len(conditions) != 3:
        raise Exception(f"{job_kind} conditions are invalid: {conditions}")

    # Job should have correct conditions. Each entry is
    # (check, expected result, error text); order matches the order in
    # which failures are reported.
    expectations = (
        (client.is_job_created, True, "should be in Created condition"),
        (client.is_job_running, False, "should not be in Running condition"),
        (
            client.is_job_restarting,
            False,
            "should not be in Restarting condition",
        ),
        (client.is_job_succeeded, True, "should be in Succeeded condition"),
        (client.is_job_failed, False, "should not be in Failed condition"),
    )
    for check, expected, problem in expectations:
        if bool(check(name, namespace, job_kind)) != expected:
            raise Exception(f"{job_kind} {problem}")

    # Print Job pod names.
    logging.info(f"\n\n\n{job_kind} pod names")
    logging.info(client.get_job_pod_names(name, namespace))

    # Print Job logs.
    logging.info(f"\n\n\n{job_kind} logs")
    client.get_job_logs(name, namespace, container=container)


def get_pod_spec_scheduler_name(gang_scheduler_name: str) -> str:
    """Return the scheduler name matching a gang scheduler test name.

    Args:
        gang_scheduler_name: Gang scheduler name used by the tests.

    Returns:
        ``DEFAULT_SCHEDULER_PLUGINS_NAME`` for the scheduler-plugins gang
        scheduler, ``TEST_GANG_SCHEDULER_NAME_VOLCANO`` for Volcano, and an
        empty string for any other value.
    """
    if gang_scheduler_name == TEST_GANG_SCHEDULER_NAME_SCHEDULER_PLUGINS:
        return DEFAULT_SCHEDULER_PLUGINS_NAME
    if gang_scheduler_name == TEST_GANG_SCHEDULER_NAME_VOLCANO:
        return TEST_GANG_SCHEDULER_NAME_VOLCANO

    return ""