github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/aistore/pytorch/utils.py (about)

     1  """
     2  Utils for AIS PyTorch Plugin
     3  
     4  Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
     5  """
     6  
     7  from typing import List, Tuple, Iterable
     8  from urllib.parse import urlparse, urlunparse, parse_qs
     9  from aistore.sdk import Client
    10  from aistore.sdk.ais_source import AISSource
    11  from aistore.sdk.object import Object
    12  
    13  
    14  def parse_url(url: str) -> Tuple[str, str, str]:
    15      """
    16      Parse AIS urls for bucket and object names
    17      Args:
    18          url (str): Complete URL of the object (eg. "ais://bucket1/file.txt")
    19      Returns:
    20          provider (str): AIS Backend
    21          bck_name (str): Bucket name identifier
    22          obj_name (str):  Object name with extension
    23      """
    24      parsed_url = urlparse(url)
    25      path = parsed_url.path
    26      if len(path) > 0 and path.startswith("/"):
    27          path = path[1:]
    28  
    29      # returns provider, bck_name, path
    30      return parsed_url.scheme, parsed_url.netloc, path
    31  
    32  
    33  # pylint: disable=unused-variable
    34  def list_objects(
    35      client: Client, urls_list: List[str], ais_source_list: List[AISSource]
    36  ) -> List[Object]:
    37      """
    38      Create list of all the objects in the given urls and AISSources
    39  
    40      Args:
    41          client (Client): AIStore client object of the calling method
    42          urls_list (List[str]): list of urls
    43          ais_source_list (AISSource, List[AISSource]): list of AISSource objects to load data
    44  
    45      Returns:
    46          List[Object]: list of all the objects in the given urls and AISSources
    47      """
    48      samples = []
    49      for item in urls_list:
    50          provider, bck_name, path = parse_url(item)
    51          objects_iter = client.bucket(
    52              bck_name=bck_name, provider=provider
    53          ).list_all_objects_iter(prefix=path)
    54          for obj in objects_iter:
    55              samples.append(obj)
    56  
    57      for item in ais_source_list:
    58          for obj in item.list_all_objects_iter():
    59              samples.append(obj)
    60  
    61      return samples
    62  
    63  
    64  def unparse_url(provider: str, bck_name: str, obj_name: str) -> str:
    65      """
    66      To generate URL based on provider, bck_name and object name
    67      Args:
    68          provider(str): Provider name ('ais', 'gcp', etc)
    69          bck_name(str): Bucket name
    70          obj_name(str): Object name with extension.
    71      Returns:
    72          unparsed_url(str): Unparsed url (complete url)
    73      """
    74      return urlunparse([provider, bck_name, obj_name, "", "", ""])
    75  
    76  
    77  def list_objects_iterator(
    78      client: Client,
    79      urls_list: List[str],
    80      ais_source_list: List[AISSource],
    81  ) -> Iterable[Object]:
    82      """
    83      Create an iterable over all the objects in the given urls and AISSources
    84  
    85      Args:
    86          client (Client): AIStore client object of the calling method
    87          urls_list (List[str]): list of urls
    88          ais_source_list (AISSource, List[AISSource]): list of AISSource objects to load data
    89  
    90      Returns:
    91          Iterable[Object]: iterable over all the objects in the given urls and AISSources
    92      """
    93      for item in urls_list:
    94          provider, bck_name, path = parse_url(item)
    95          objects_iter = client.bucket(
    96              bck_name=bck_name, provider=provider
    97          ).list_all_objects_iter(prefix=path)
    98          for obj in objects_iter:
    99              yield obj
   100  
   101      for item in ais_source_list:
   102          for obj in item.list_all_objects_iter():
   103              yield obj