github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/aistore/sdk/dsort.py (about)

     1  import json
     2  import logging
     3  import time
     4  from typing import Dict
     5  
     6  from aistore.sdk.const import (
     7      HTTP_METHOD_POST,
     8      URL_PATH_DSORT,
     9      HTTP_METHOD_GET,
    10      DEFAULT_DSORT_WAIT_TIMEOUT,
    11      HTTP_METHOD_DELETE,
    12      DSORT_ABORT,
    13      DSORT_UUID,
    14  )
    15  from aistore.sdk.dsort_types import JobInfo
    16  from aistore.sdk.errors import Timeout
    17  from aistore.sdk.utils import validate_file, probing_frequency
    18  
    19  
    20  class Dsort:
    21      """
    22      Class for managing jobs for the dSort extension: https://github.com/NVIDIA/aistore/blob/main/docs/cli/dsort.md
    23      """
    24  
    25      def __init__(self, client: "Client", dsort_id: str = ""):
    26          self._client = client
    27          self._dsort_id = dsort_id
    28  
    29      @property
    30      def dsort_id(self) -> str:
    31          """
    32          Return dSort job id
    33          """
    34          return self._dsort_id
    35  
    36      def start(self, spec_file: str) -> str:
    37          """
    38          Start a dSort job with a provided spec file location
    39          Returns:
    40              dSort job ID
    41          """
    42          validate_file(spec_file)
    43          with open(spec_file, "r", encoding="utf-8") as file_data:
    44              spec = json.load(file_data)
    45              self._dsort_id = self._client.request(
    46                  HTTP_METHOD_POST, path=URL_PATH_DSORT, json=spec
    47              ).text
    48          return self._dsort_id
    49  
    50      def abort(self):
    51          """
    52          Abort a dSort job
    53          """
    54          qparam = {DSORT_UUID: [self._dsort_id]}
    55          self._client.request(
    56              HTTP_METHOD_DELETE, path=f"{URL_PATH_DSORT}/{DSORT_ABORT}", params=qparam
    57          )
    58  
    59      def get_job_info(self) -> Dict[str, JobInfo]:
    60          """
    61          Get info for a dsort job
    62          Returns:
    63              Dictionary of job info for all jobs associated with this dsort
    64          """
    65          qparam = {DSORT_UUID: [self._dsort_id]}
    66          return self._client.request_deserialize(
    67              HTTP_METHOD_GET,
    68              path=URL_PATH_DSORT,
    69              res_model=Dict[str, JobInfo],
    70              params=qparam,
    71          )
    72  
    73      def wait(
    74          self,
    75          timeout: int = DEFAULT_DSORT_WAIT_TIMEOUT,
    76          verbose: bool = True,
    77      ):
    78          """
    79          Wait for a dSort job to finish
    80  
    81          Args:
    82              timeout (int, optional): The maximum time to wait for the job, in seconds. Default timeout is 5 minutes.
    83              verbose (bool, optional): Whether to log wait status to standard output
    84  
    85          Raises:
    86              requests.RequestException: "There was an ambiguous exception that occurred while handling..."
    87              requests.ConnectionError: Connection error
    88              requests.ConnectionTimeout: Timed out connecting to AIStore
    89              requests.ReadTimeout: Timed out waiting response from AIStore
    90              errors.Timeout: Timeout while waiting for the job to finish
    91          """
    92          logger = logging.getLogger(f"{__name__}.wait")
    93          logger.disabled = not verbose
    94          passed = 0
    95          sleep_time = probing_frequency(timeout)
    96          while True:
    97              if passed > timeout:
    98                  raise Timeout("dsort job to finish")
    99              finished = True
   100              for job_info in self.get_job_info().values():
   101                  if job_info.metrics.aborted:
   102                      logger.info("DSort job '%s' aborted", self._dsort_id)
   103                      return
   104                  # Shard creation is the last phase, so check if it's finished
   105                  finished = job_info.metrics.shard_creation.finished and finished
   106              if finished:
   107                  logger.info("DSort job '%s' finished", self._dsort_id)
   108                  return
   109              logger.info("Waiting on dsort job '%s'...", self._dsort_id)
   110              time.sleep(sleep_time)
   111              passed += sleep_time