sigs.k8s.io/kueue@v0.6.2/site/static/examples/python/sample-mpijob.py

sigs.k8s.io/kueue@v0.6.2/site/static/examples/python/sample-mpijob.py (about)

     1  #!/usr/bin/env python3
     2  
     3  import argparse
     4  from kubernetes import config, client
     5  import mpijob.models as models
     6  
     7  # sample-mpijob.py
     8  # This example will demonstrate full steps to submit a Job via the MPI Operator
     9  
# Make sure your cluster is running!
# Load cluster connection settings before any API client is created;
# this runs at import time, so importing this module needs a usable kubeconfig.
config.load_kube_config()
# Module-level client for custom resources. Note that main() builds its own
# CustomObjectsApi instance rather than using this one.
crd_api = client.CustomObjectsApi()
# The API client object backing crd_api; it is not referenced again in this file.
api_client = crd_api.api_client
    14  
    15  
    16  def get_parser():
    17      parser = argparse.ArgumentParser(
    18          description="Submit Kueue MPI Operator Job Example",
    19          formatter_class=argparse.RawTextHelpFormatter,
    20      )
    21      parser.add_argument(
    22          "--job-name",
    23          help="generateName field to set for job (job prefix does not work here)",
    24          default="pi",
    25      )
    26      parser.add_argument(
    27          "--image",
    28          help="container image to use",
    29          default="mpioperator/mpi-pi:openmpi",
    30      )
    31      parser.add_argument(
    32          "--command",
    33          help="command to run",
    34          default="mpirun",
    35      )
    36      parser.add_argument(
    37          "--args",
    38          nargs="+",
    39          help="args for container",
    40          default=["-n", "2", "/home/mpiuser/pi"],
    41      )
    42      return parser
    43  
    44  
    45  def generate_job_crd(job_name, image, command, args):
    46      """
    47      Generate an equivalent job CRD to sample-job.yaml
    48      """
    49      metadata = client.V1ObjectMeta(
    50          name=job_name, labels={"kueue.x-k8s.io/queue-name": "user-queue"}
    51      )
    52  
    53      # containers for launcher and worker
    54      launcher_container = client.V1Container(
    55          image=image,
    56          name="mpi-launcher",
    57          command=[command],
    58          args=args,
    59          security_context=client.V1SecurityContext(run_as_user=1000),
    60          resources={
    61              "limits": {
    62                  "cpu": 1,
    63                  "memory": "1Gi",
    64              }
    65          },
    66      )
    67  
    68      worker_container = client.V1Container(
    69          image=image,
    70          name="mpi-worker",
    71          command=["/usr/sbin/sshd"],
    72          args=["-De", "-f", "/home/mpiuser/.sshd_config"],
    73          security_context=client.V1SecurityContext(run_as_user=1000),
    74          resources={
    75              "limits": {
    76                  "cpu": 1,
    77                  "memory": "1Gi",
    78              }
    79          },
    80      )
    81  
    82      # Create the Launcher and worker replica specs
    83      launcher = models.V2beta1ReplicaSpec(
    84          replicas=1,
    85          template=client.V1PodTemplateSpec(
    86              spec=client.V1PodSpec(containers=[launcher_container])
    87          ),
    88      )
    89  
    90      worker = models.V2beta1ReplicaSpec(
    91          replicas=2,
    92          template=client.V1PodTemplateSpec(
    93              spec=client.V1PodSpec(containers=[worker_container])
    94          ),
    95      )
    96  
    97      # runPolicy for jobspec
    98      policy = models.V2beta1RunPolicy(
    99          clean_pod_policy="Running", ttl_seconds_after_finished=60
   100      )
   101  
   102      # Create the jobspec
   103      jobspec = models.V2beta1MPIJobSpec(
   104          slots_per_worker=1,
   105          run_policy=policy,
   106          ssh_auth_mount_path="/home/mpiuser/.ssh",
   107          mpi_replica_specs={"Launcher": launcher, "Worker": worker},
   108      )
   109      return models.V2beta1MPIJob(
   110          metadata=metadata,
   111          api_version="kubeflow.org/v2beta1",
   112          kind="MPIJob",
   113          spec=jobspec,
   114      )
   115  
   116  
   117  def main():
   118      """
   119      Run an MPI job. This requires the MPI Operator to be installed.
   120      """
   121      parser = get_parser()
   122      args, _ = parser.parse_known_args()
   123  
   124      # Generate a CRD spec
   125      crd = generate_job_crd(args.job_name, args.image, args.command, args.args)
   126      crd_api = client.CustomObjectsApi()
   127  
   128      print(f"📦️ Container image selected is {args.image}...")
   129      print(f"⭐️ Creating sample job with prefix {args.job_name}...")
   130      crd_api.create_namespaced_custom_object(
   131          group="kubeflow.org",
   132          version="v2beta1",
   133          namespace="default",
   134          plural="mpijobs",
   135          body=crd,
   136      )
   137      print(
   138          'Use:\n"kubectl get queue" to see queue assignment\n"kubectl get jobs" to see jobs'
   139      )
   140  
   141  
# Submit the job only when this file is executed as a script.
if __name__ == "__main__":
    main()