# Origin: github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/aistore/sdk/cluster.py
#
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#

from __future__ import annotations  # pylint: disable=unused-variable

import logging
from typing import List, Optional

from aistore.sdk.const import (
    HTTP_METHOD_GET,
    ACT_LIST,
    PROVIDER_AIS,
    QPARAM_WHAT,
    QPARAM_PRIMARY_READY_REB,
    QPARAM_PROVIDER,
    HEADER_NODE_ID,
    URL_PATH_ETL,
    URL_PATH_REVERSE,
    URL_PATH_BUCKETS,
    URL_PATH_HEALTH,
    URL_PATH_DAEMON,
    URL_PATH_CLUSTER,
    WHAT_SMAP,
    WHAT_ALL_XACT_STATUS,
    WHAT_ALL_RUNNING_STATUS,
    WHAT_NODE_STATS_AND_STATUS,
    WHAT_NODE_STATS_AND_STATUS_V322,
)

from aistore.sdk.types import (
    BucketModel,
    JobStatus,
    JobQuery,
    ETLInfo,
    ActionMsg,
    Smap,
    NodeStats,
    NodeStatsV322,
    ClusterPerformance,
    NodeCounter,
    NodeLatency,
    NodeThroughput,
)
from aistore.sdk.request_client import RequestClient
from aistore.sdk.errors import AISError

logger = logging.getLogger("cluster")


# pylint: disable=unused-variable
class Cluster:
    """
    A class representing a cluster bound to an AIS client.
    """

    # pylint: disable=duplicate-code
    def __init__(self, client: RequestClient):
        self._client = client

    @property
    def client(self) -> RequestClient:
        """Client this cluster uses to make requests"""
        return self._client

    def get_info(self) -> Smap:
        """
        Returns state of AIS cluster, including the detailed information about its nodes.

        Returns:
            aistore.sdk.types.Smap: Smap containing cluster information

        Raises:
            requests.RequestException: "There was an ambiguous exception that occurred while handling..."
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.ReadTimeout: Timed out waiting response from AIStore
        """
        return self._get_smap()

    def get_primary_url(self) -> str:
        """
        Returns: URL of primary proxy
        """
        return self._get_smap().proxy_si.public_net.direct_url

    def list_buckets(self, provider: str = PROVIDER_AIS) -> List[BucketModel]:
        """
        Returns list of buckets in AIStore cluster.

        Args:
            provider (str, optional): Name of bucket provider, one of "ais", "aws", "gcp", "az" or "ht".
            Defaults to "ais". Empty provider returns buckets of all providers.

        Returns:
            List[BucketModel]: A list of buckets

        Raises:
            requests.RequestException: "There was an ambiguous exception that occurred while handling..."
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.ReadTimeout: Timed out waiting response from AIStore
        """
        params = {QPARAM_PROVIDER: provider}
        action = ActionMsg(action=ACT_LIST).dict()

        return self._client.request_deserialize(
            HTTP_METHOD_GET,
            path=URL_PATH_BUCKETS,
            res_model=List[BucketModel],
            json=action,
            params=params,
        )

    def list_jobs_status(self, job_kind="", target_id="") -> List[JobStatus]:
        """
        List the status of jobs on the cluster

        Args:
            job_kind (str, optional): Only show jobs of a particular type
            target_id (str, optional): Limit to jobs on a specific target node

        Returns:
            List of JobStatus objects
        """
        res = self._client.request_deserialize(
            HTTP_METHOD_GET,
            path=URL_PATH_CLUSTER,
            res_model=Optional[List[JobStatus]],
            json=JobQuery(kind=job_kind, target=target_id).as_dict(),
            params={QPARAM_WHAT: WHAT_ALL_XACT_STATUS},
        )
        # The response model is Optional; normalize a missing payload to an empty list.
        return res if res is not None else []

    def list_running_jobs(self, job_kind="", target_id="") -> List[str]:
        """
        List the currently running jobs on the cluster

        Args:
            job_kind (str, optional): Only show jobs of a particular type
            target_id (str, optional): Limit to jobs on a specific target node

        Returns:
            List of jobs in the format job_kind[job_id]
        """
        return self._client.request_deserialize(
            HTTP_METHOD_GET,
            path=URL_PATH_CLUSTER,
            res_model=List[str],
            json=JobQuery(kind=job_kind, target=target_id, active=True).as_dict(),
            params={QPARAM_WHAT: WHAT_ALL_RUNNING_STATUS},
        )

    def list_running_etls(self) -> List[ETLInfo]:
        """
        Lists all running ETLs.

        Note: Does not list ETLs that have been stopped or deleted.

        Returns:
            List[ETLInfo]: A list of details on running ETLs
        """
        return self._client.request_deserialize(
            HTTP_METHOD_GET, path=URL_PATH_ETL, res_model=List[ETLInfo]
        )

    def is_ready(self) -> bool:
        """
        Checks if cluster is ready or still setting up.

        Returns:
            bool: True if cluster is ready, or false if cluster is still setting up
        """
        # compare with AIS Go API (api/cluster.go) for additional supported options
        params = {QPARAM_PRIMARY_READY_REB: "true"}
        try:
            # Resolving the primary URL happens inside the try block on purpose:
            # a failure to fetch the cluster map is reported as "not ready".
            resp = self._client.request(
                HTTP_METHOD_GET,
                path=URL_PATH_HEALTH,
                endpoint=self.get_primary_url(),
                params=params,
            )
            return resp.ok
        except Exception as err:  # deliberate best-effort probe: any failure means "not ready"
            logger.debug(err)
            return False

    # pylint: disable=too-many-locals
    def get_performance(
        self,
        get_throughput: bool = True,
        get_latency: bool = True,
        get_counters: bool = True,
    ) -> ClusterPerformance:
        """
        Retrieves and calculates the performance metrics for each target node in the AIStore cluster.
        It compiles throughput, latency, and various operational counters from each target node,
        providing a comprehensive view of the cluster's overall performance

        Args:
            get_throughput (bool, optional): get cluster throughput
            get_latency (bool, optional): get cluster latency
            get_counters (bool, optional): get cluster counters

        Returns:
            ClusterPerformance: An object encapsulating the detailed performance metrics of the cluster,
                including throughput, latency, and counters for each node

        Raises:
            requests.RequestException: If there's an ambiguous exception while processing the request
            requests.ConnectionError: If there's a connection error with the cluster
            requests.ConnectTimeout: If the connection to the cluster times out
            requests.ReadTimeout: If the timeout is reached while awaiting a response from the cluster
            AISError: If a target's stats request fails for a reason other than the
                "unrecognized what=node_status" fallback case handled below
        """

        targets = self._get_targets()
        target_stats = {}
        params = {QPARAM_WHAT: WHAT_NODE_STATS_AND_STATUS}
        res_model = NodeStats
        stats_path = f"{URL_PATH_REVERSE}/{URL_PATH_DAEMON}"
        for target_id in targets:
            headers = {HEADER_NODE_ID: target_id}
            try:
                res = self._client.request_deserialize(
                    HTTP_METHOD_GET,
                    path=stats_path,
                    res_model=res_model,
                    headers=headers,
                    params=params,
                )
            except AISError as err:
                if "unrecognized what=node_status" not in err.message:
                    raise
                # The node rejected the current query: retry with the V322 query and
                # model. `params` and `res_model` stay switched for all remaining
                # targets, so the failing request is not repeated per node.
                params = {QPARAM_WHAT: WHAT_NODE_STATS_AND_STATUS_V322}
                res_model = NodeStatsV322
                res = self._client.request_deserialize(
                    HTTP_METHOD_GET,
                    path=stats_path,
                    res_model=res_model,
                    headers=headers,
                    params=params,
                )
            target_stats[target_id] = res

        throughputs = {}
        latencies = {}
        counters = {}
        for target_id, val in target_stats.items():
            if get_throughput:
                throughputs[target_id] = NodeThroughput(val.tracker)
            if get_latency:
                latencies[target_id] = NodeLatency(val.tracker)
            if get_counters:
                counters[target_id] = NodeCounter(val.tracker)

        return ClusterPerformance(
            throughput=throughputs, latency=latencies, counters=counters
        )

    def _get_smap(self) -> Smap:
        """Fetch the cluster map (Smap) via a GET on the daemon path."""
        return self._client.request_deserialize(
            HTTP_METHOD_GET,
            path=URL_PATH_DAEMON,
            res_model=Smap,
            params={QPARAM_WHAT: WHAT_SMAP},
        )

    def _get_targets(self) -> list:
        """Return the keys of the cluster map's `tmap` (used as target node IDs)."""
        tmap = self._get_smap().tmap
        return list(tmap)