github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/echo/echo.py

github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/echo/echo.py (about)

     1  #!/usr/bin/env python3
     2  import io
     3  import os
     4  import pprint
     5  import sys
     6  import time
     7  
     8  import torch.distributed as dist
     9  
    10  
    11  if __name__ == "__main__":
    12  
    13      env_dict = {
    14          k: os.environ[k]
    15          for k in (
    16              "LOCAL_RANK",
    17              "RANK",
    18              "GROUP_RANK",
    19              "WORLD_SIZE",
    20              "MASTER_ADDR",
    21              "MASTER_PORT",
    22              "TORCHELASTIC_RESTART_COUNT",
    23              "TORCHELASTIC_MAX_RESTARTS",
    24          )
    25      }
    26  
    27      with io.StringIO() as buff:
    28          print("======================================================", file=buff)
    29          print(
    30              f"Environment variables set by the agent on PID {os.getpid()}:", file=buff
    31          )
    32          pprint.pprint(env_dict, stream=buff)
    33          print("======================================================", file=buff)
    34          print(buff.getvalue())
    35          sys.stdout.flush()
    36  
    37      dist.init_process_group(backend="gloo")
    38      dist.barrier()
    39  
    40      print(
    41          (
    42              f"On PID {os.getpid()}, after init process group, "
    43              f"rank={dist.get_rank()}, world_size = {dist.get_world_size()}\n"
    44          )
    45      )