#!/usr/bin/env python3
# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/echo/echo.py
"""Echo the launcher-provided environment and join a gloo process group.

Each worker prints the rendezvous-related environment variables it was
started with, initialises ``torch.distributed`` with the ``gloo`` backend,
synchronises with its peers on a barrier, and reports its rank and the world
size before tearing the process group down again.
"""
import io
import os
import pprint
import sys
import time
from typing import Dict, Mapping, Optional

import torch.distributed as dist

# Environment variables this script reads and echoes; its banner describes
# them as "set by the agent".
AGENT_ENV_VARS = (
    "LOCAL_RANK",
    "RANK",
    "GROUP_RANK",
    "WORLD_SIZE",
    "MASTER_ADDR",
    "MASTER_PORT",
    "TORCHELASTIC_RESTART_COUNT",
    "TORCHELASTIC_MAX_RESTARTS",
)

SEPARATOR = "======================================================"


def collect_agent_env(environ: Optional[Mapping[str, str]] = None) -> Dict[str, str]:
    """Return the ``AGENT_ENV_VARS`` entries found in *environ*.

    Args:
        environ: Mapping to read from; defaults to ``os.environ``.

    Returns:
        A dict with exactly the keys in ``AGENT_ENV_VARS``, in that order.

    Raises:
        KeyError: If any of the expected variables is missing.
    """
    if environ is None:
        environ = os.environ
    return {name: environ[name] for name in AGENT_ENV_VARS}


def format_env_report(env: Mapping[str, str], pid: Optional[int] = None) -> str:
    """Render the banner that lists *env* for the process *pid*.

    Args:
        env: Variables to pretty-print between the separator lines.
        pid: Process id shown in the header; defaults to ``os.getpid()``.

    Returns:
        The report text, terminated by a newline.
    """
    if pid is None:
        pid = os.getpid()
    with io.StringIO() as buff:
        print(SEPARATOR, file=buff)
        print(f"Environment variables set by the agent on PID {pid}:", file=buff)
        pprint.pprint(env, stream=buff)
        print(SEPARATOR, file=buff)
        return buff.getvalue()


def main() -> None:
    """Print the environment report, then rendezvous and report the rank."""
    # The report already ends with a newline; print() adds one more, which
    # leaves a blank line after the banner.
    print(format_env_report(collect_agent_env()))
    # Flush so the banner is visible even if initialisation below blocks.
    sys.stdout.flush()

    dist.init_process_group(backend="gloo")
    try:
        dist.barrier()

        print(
            (
                f"On PID {os.getpid()}, after init process group, "
                f"rank={dist.get_rank()}, world_size = {dist.get_world_size()}\n"
            )
        )
    finally:
        # Release the process group's resources on success and on failure;
        # only reached when init_process_group() itself succeeded.
        dist.destroy_process_group()


if __name__ == "__main__":
    main()