github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/utils.py (about)

     1  import logging
     2  import time
     3  
     4  from kubeflow.training import TrainingClient
     5  from kubeflow.training.constants import constants
     6  from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_SCHEDULER_PLUGINS
     7  from test.e2e.constants import DEFAULT_SCHEDULER_PLUGINS_NAME
     8  from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_VOLCANO
     9  
# Emit only the bare message text (no level or logger-name prefix) so the
# e2e output reads as plain lines.
logging.basicConfig(format="%(message)s")
# Set the root logger threshold to INFO explicitly so that the logging.info()
# calls in the helpers below are emitted. This is done as a separate call
# because basicConfig() is a no-op when the root logger already has handlers.
logging.getLogger().setLevel(logging.INFO)
    12  
    13  
    14  def verify_unschedulable_job_e2e(
    15      client: TrainingClient, name: str, namespace: str, job_kind: str
    16  ):
    17      """Verify unschedulable Training Job e2e test."""
    18      logging.info(f"\n\n\n{job_kind} is creating")
    19      client.wait_for_job_conditions(name, namespace, job_kind, {constants.JOB_CONDITION_CREATED})
    20  
    21      logging.info("Checking 3 times that pods are not scheduled")
    22      for num in range(3):
    23          logging.info(f"Number of attempts: {int(num)+1}/3")
    24          # Job should have a Created condition.
    25          if not client.is_job_created(name, namespace, job_kind):
    26              raise Exception(f"{job_kind} should be in Created condition")
    27  
    28          # Job shouldn't have a Running condition.
    29          if client.is_job_running(name, namespace, job_kind):
    30              raise Exception(f"{job_kind} shouldn't be in Running condition")
    31  
    32          logging.info("Sleeping 5 seconds...")
    33          time.sleep(5)
    34  
    35  
    36  def verify_job_e2e(
    37      client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 600
    38  ):
    39      """Verify Training Job e2e test."""
    40  
    41      # Wait until Job is Succeeded.
    42      logging.info(f"\n\n\n{job_kind} is running")
    43      client.wait_for_job_conditions(name, namespace, job_kind, timeout=timeout)
    44  
    45      # Job should have Created, Running, and Succeeded conditions.
    46      conditions = client.get_job_conditions(name, namespace, job_kind)
    47      if len(conditions) != 3:
    48          raise Exception(f"{job_kind} conditions are invalid: {conditions}")
    49  
    50      # Job should have correct conditions.
    51      if not client.is_job_created(name, namespace, job_kind):
    52          raise Exception(f"{job_kind} should be in Created condition")
    53  
    54      if client.is_job_running(name, namespace, job_kind):
    55          raise Exception(f"{job_kind} should not be in Running condition")
    56  
    57      if client.is_job_restarting(name, namespace, job_kind):
    58          raise Exception(f"{job_kind} should not be in Restarting condition")
    59  
    60      if not client.is_job_succeeded(name, namespace, job_kind):
    61          raise Exception(f"{job_kind} should be in Succeeded condition")
    62  
    63      if client.is_job_failed(name, namespace, job_kind):
    64          raise Exception(f"{job_kind} should not be in Failed condition")
    65  
    66      # Print Job pod names.
    67      logging.info(f"\n\n\n{job_kind} pod names")
    68      logging.info(client.get_job_pod_names(name, namespace))
    69  
    70      # Print Job logs.
    71      logging.info(f"\n\n\n{job_kind} logs")
    72      client.get_job_logs(name, namespace, container=container)
    73  
    74  
    75  def get_pod_spec_scheduler_name(gang_scheduler_name: str) -> str:
    76      if gang_scheduler_name == TEST_GANG_SCHEDULER_NAME_SCHEDULER_PLUGINS:
    77          return DEFAULT_SCHEDULER_PLUGINS_NAME
    78      elif gang_scheduler_name == TEST_GANG_SCHEDULER_NAME_VOLCANO:
    79          return TEST_GANG_SCHEDULER_NAME_VOLCANO
    80  
    81      return ""