github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/aistore/sdk/cluster.py

github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/aistore/sdk/cluster.py (about)

     1  #
     2  # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
     3  #
     4  
     5  from __future__ import annotations  # pylint: disable=unused-variable
     6  
     7  import logging
     8  from typing import List, Optional
     9  
    10  from aistore.sdk.const import (
    11      HTTP_METHOD_GET,
    12      ACT_LIST,
    13      PROVIDER_AIS,
    14      QPARAM_WHAT,
    15      QPARAM_PRIMARY_READY_REB,
    16      QPARAM_PROVIDER,
    17      HEADER_NODE_ID,
    18      URL_PATH_ETL,
    19      URL_PATH_REVERSE,
    20      URL_PATH_BUCKETS,
    21      URL_PATH_HEALTH,
    22      URL_PATH_DAEMON,
    23      URL_PATH_CLUSTER,
    24      WHAT_SMAP,
    25      WHAT_ALL_XACT_STATUS,
    26      WHAT_ALL_RUNNING_STATUS,
    27      WHAT_NODE_STATS_AND_STATUS,
    28      WHAT_NODE_STATS_AND_STATUS_V322,
    29  )
    30  
    31  from aistore.sdk.types import (
    32      BucketModel,
    33      JobStatus,
    34      JobQuery,
    35      ETLInfo,
    36      ActionMsg,
    37      Smap,
    38      NodeStats,
    39      NodeStatsV322,
    40      ClusterPerformance,
    41      NodeCounter,
    42      NodeLatency,
    43      NodeThroughput,
    44  )
    45  from aistore.sdk.request_client import RequestClient
    46  from aistore.sdk.errors import AISError
    47  
# Module-level logger; Cluster.is_ready writes the errors it swallows to it at debug level.
logger = logging.getLogger("cluster")
    49  
    50  
    51  # pylint: disable=unused-variable
    52  class Cluster:
    53      """
    54      A class representing a cluster bound to an AIS client.
    55      """
    56  
    57      # pylint: disable=duplicate-code
    58      def __init__(self, client: RequestClient):
    59          self._client = client
    60  
    61      @property
    62      def client(self):
    63          """Client this cluster uses to make requests"""
    64          return self._client
    65  
    66      def get_info(self) -> Smap:
    67          """
    68          Returns state of AIS cluster, including the detailed information about its nodes.
    69  
    70          Returns:
    71              aistore.sdk.types.Smap: Smap containing cluster information
    72  
    73          Raises:
    74              requests.RequestException: "There was an ambiguous exception that occurred while handling..."
    75              requests.ConnectionError: Connection error
    76              requests.ConnectionTimeout: Timed out connecting to AIStore
    77              requests.ReadTimeout: Timed out waiting response from AIStore
    78          """
    79          return self._get_smap()
    80  
    81      def get_primary_url(self) -> str:
    82          """
    83          Returns: URL of primary proxy
    84          """
    85          return self._get_smap().proxy_si.public_net.direct_url
    86  
    87      def list_buckets(self, provider: str = PROVIDER_AIS):
    88          """
    89          Returns list of buckets in AIStore cluster.
    90  
    91          Args:
    92              provider (str, optional): Name of bucket provider, one of "ais", "aws", "gcp", "az" or "ht".
    93              Defaults to "ais". Empty provider returns buckets of all providers.
    94  
    95          Returns:
    96              List[BucketModel]: A list of buckets
    97  
    98          Raises:
    99              requests.RequestException: "There was an ambiguous exception that occurred while handling..."
   100              requests.ConnectionError: Connection error
   101              requests.ConnectionTimeout: Timed out connecting to AIStore
   102              requests.ReadTimeout: Timed out waiting response from AIStore
   103          """
   104          params = {QPARAM_PROVIDER: provider}
   105          action = ActionMsg(action=ACT_LIST).dict()
   106  
   107          return self.client.request_deserialize(
   108              HTTP_METHOD_GET,
   109              path=URL_PATH_BUCKETS,
   110              res_model=List[BucketModel],
   111              json=action,
   112              params=params,
   113          )
   114  
   115      def list_jobs_status(self, job_kind="", target_id="") -> List[JobStatus]:
   116          """
   117          List the status of jobs on the cluster
   118  
   119          Args:
   120              job_kind (str, optional): Only show jobs of a particular type
   121              target_id (str, optional): Limit to jobs on a specific target node
   122  
   123          Returns:
   124              List of JobStatus objects
   125          """
   126          res = self._client.request_deserialize(
   127              HTTP_METHOD_GET,
   128              path=URL_PATH_CLUSTER,
   129              res_model=Optional[List[JobStatus]],
   130              json=JobQuery(kind=job_kind, target=target_id).as_dict(),
   131              params={QPARAM_WHAT: WHAT_ALL_XACT_STATUS},
   132          )
   133          if res is None:
   134              return []
   135          return res
   136  
   137      def list_running_jobs(self, job_kind="", target_id="") -> List[str]:
   138          """
   139          List the currently running jobs on the cluster
   140  
   141          Args:
   142              job_kind (str, optional): Only show jobs of a particular type
   143              target_id (str, optional): Limit to jobs on a specific target node
   144  
   145          Returns:
   146              List of jobs in the format job_kind[job_id]
   147          """
   148          return self._client.request_deserialize(
   149              HTTP_METHOD_GET,
   150              path=URL_PATH_CLUSTER,
   151              res_model=List[str],
   152              json=JobQuery(kind=job_kind, target=target_id, active=True).as_dict(),
   153              params={QPARAM_WHAT: WHAT_ALL_RUNNING_STATUS},
   154          )
   155  
   156      def list_running_etls(self) -> List[ETLInfo]:
   157          """
   158          Lists all running ETLs.
   159  
   160          Note: Does not list ETLs that have been stopped or deleted.
   161  
   162          Returns:
   163              List[ETLInfo]: A list of details on running ETLs
   164          """
   165          return self._client.request_deserialize(
   166              HTTP_METHOD_GET, path=URL_PATH_ETL, res_model=List[ETLInfo]
   167          )
   168  
   169      def is_ready(self) -> bool:
   170          """
   171          Checks if cluster is ready or still setting up.
   172  
   173          Returns:
   174              bool: True if cluster is ready, or false if cluster is still setting up
   175          """
   176          # compare with AIS Go API (api/cluster.go) for additional supported options
   177          params = {QPARAM_PRIMARY_READY_REB: "true"}
   178          try:
   179              resp = self._client.request(
   180                  HTTP_METHOD_GET,
   181                  path=URL_PATH_HEALTH,
   182                  endpoint=self.get_primary_url(),
   183                  params=params,
   184              )
   185              return resp.ok
   186          except Exception as err:
   187              logger.debug(err)
   188              return False
   189  
   190      # pylint: disable=too-many-locals
   191      def get_performance(
   192          self,
   193          get_throughput: bool = True,
   194          get_latency: bool = True,
   195          get_counters: bool = True,
   196      ) -> ClusterPerformance:
   197          """
   198          Retrieves and calculates the performance metrics for each target node in the AIStore cluster.
   199          It compiles throughput, latency, and various operational counters from each target node,
   200          providing a comprehensive view of the cluster's overall performance
   201  
   202          Args:
   203              get_throughput (bool, optional): get cluster throughput
   204              get_latency (bool, optional): get cluster latency
   205              get_counters (bool, optional): get cluster counters
   206  
   207          Returns:
   208              ClusterPerformance: An object encapsulating the detailed performance metrics of the cluster,
   209                  including throughput, latency, and counters for each node
   210  
   211          Raises:
   212              requests.RequestException: If there's an ambiguous exception while processing the request
   213              requests.ConnectionError: If there's a connection error with the cluster
   214              requests.ConnectionTimeout: If the connection to the cluster times out
   215              requests.ReadTimeout: If the timeout is reached while awaiting a response from the cluster
   216          """
   217  
   218          targets = self._get_targets()
   219          target_stats = {}
   220          params = {QPARAM_WHAT: WHAT_NODE_STATS_AND_STATUS}
   221          res_model = NodeStats
   222          for target_id in targets:
   223              headers = {HEADER_NODE_ID: target_id}
   224              try:
   225                  res = self.client.request_deserialize(
   226                      HTTP_METHOD_GET,
   227                      path=f"{URL_PATH_REVERSE}/{URL_PATH_DAEMON}",
   228                      res_model=res_model,
   229                      headers=headers,
   230                      params=params,
   231                  )
   232              except AISError as err:
   233                  if "unrecognized what=node_status" in err.message:
   234                      params = {QPARAM_WHAT: WHAT_NODE_STATS_AND_STATUS_V322}
   235                      res_model = NodeStatsV322
   236                      res = self.client.request_deserialize(
   237                          HTTP_METHOD_GET,
   238                          path=f"{URL_PATH_REVERSE}/{URL_PATH_DAEMON}",
   239                          res_model=res_model,
   240                          headers=headers,
   241                          params=params,
   242                      )
   243                  else:
   244                      raise err
   245              target_stats[target_id] = res
   246          throughputs = {}
   247          latencies = {}
   248          counters = {}
   249          for target_id, val in target_stats.items():
   250              if get_throughput:
   251                  throughputs[target_id] = NodeThroughput(val.tracker)
   252              if get_latency:
   253                  latencies[target_id] = NodeLatency(val.tracker)
   254              if get_counters:
   255                  counters[target_id] = NodeCounter(val.tracker)
   256  
   257          return ClusterPerformance(
   258              throughput=throughputs, latency=latencies, counters=counters
   259          )
   260  
   261      def _get_smap(self):
   262          return self.client.request_deserialize(
   263              HTTP_METHOD_GET,
   264              path=URL_PATH_DAEMON,
   265              res_model=Smap,
   266              params={QPARAM_WHAT: WHAT_SMAP},
   267          )
   268  
   269      def _get_targets(self):
   270          tmap = self._get_smap().tmap
   271          return list(tmap.keys())