github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/aistore/sdk/multiobj/object_group.py

#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
import logging
from typing import List, Iterable

from aistore.sdk.ais_source import AISSource
from aistore.sdk.const import (
    HTTP_METHOD_DELETE,
    HTTP_METHOD_POST,
    HTTP_METHOD_PUT,
    ACT_DELETE_OBJECTS,
    ACT_PREFETCH_OBJECTS,
    ACT_EVICT_OBJECTS,
    ACT_COPY_OBJECTS,
    ACT_TRANSFORM_OBJECTS,
    ACT_ARCHIVE_OBJECTS,
)
from aistore.sdk.etl_const import DEFAULT_ETL_TIMEOUT
from aistore.sdk.object import Object
from aistore.sdk.multiobj.object_names import ObjectNames
from aistore.sdk.multiobj.object_range import ObjectRange
from aistore.sdk.multiobj.object_template import ObjectTemplate
from aistore.sdk.types import (
    TCMultiObj,
    CopyBckMsg,
    TransformBckMsg,
    TCBckMsg,
    ArchiveMultiObj,
    PrefetchMsg,
)


# pylint: disable=unused-variable
class ObjectGroup(AISSource):
    """
    A class representing multiple objects within the same bucket. Exactly one of
    obj_names, obj_range, or obj_template must be provided.

    Args:
        bck (Bucket): Bucket the objects belong to
        obj_names (List[str], optional): List of object names to include in this collection
        obj_range (ObjectRange, optional): Range defining which object names in the bucket should be included
        obj_template (str, optional): String argument to pass as the template value directly to the API
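
    Example:
        A minimal construction sketch, assuming a reachable AIS cluster. In
        practice an ObjectGroup is usually created via Bucket.objects(...)
        rather than constructed directly; the endpoint, bucket, and object
        names below are placeholders::

            from aistore.sdk import Client

            client = Client("http://localhost:8080")
            bucket = client.bucket("my-bck")
            group = bucket.objects(obj_names=["obj-1", "obj-2"])
            # alternatively, select by template, e.g.:
            # group = bucket.objects(obj_template="shard-{0..99}.tar")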
    """

    def __init__(
        self,
        bck: "Bucket",
        obj_names: List[str] = None,
        obj_range: ObjectRange = None,
        obj_template: str = None,
    ):
        self.bck = bck
        num_args = sum(
            x is not None for x in [obj_names, obj_range, obj_template]
        )
        if num_args != 1:
            raise ValueError(
                "ObjectGroup accepts one and only one of: obj_names, obj_range, or obj_template"
            )
        if obj_range and not isinstance(obj_range, ObjectRange):
            raise TypeError("obj_range must be of type ObjectRange")

        if obj_range:
            self._obj_collection = obj_range
        elif obj_names:
            self._obj_collection = ObjectNames(obj_names)
        else:
            self._obj_collection = ObjectTemplate(obj_template)

    def list_urls(self, prefix: str = "", etl_name: str = None) -> Iterable[str]:
        """
        Implementation of the abstract method from AISSource that provides an iterator
        of full URLs to every object in this group.

        Args:
            prefix (str, optional): Not used for object groups; present to match the
                AISSource interface
            etl_name (str, optional): ETL to include in URLs

        Returns:
            Iterator of all object URLs in the group
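
        Example:
            A brief usage sketch; the endpoint, bucket, and object names are
            placeholders::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("my-bck").objects(obj_names=["a.bin", "b.bin"])
                for url in group.list_urls():
                    print(url)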
        """
        for obj_name in self._obj_collection:
            yield self.bck.object(obj_name).get_url(etl_name=etl_name)

    def list_all_objects_iter(self, prefix: str = "") -> Iterable[Object]:
        """
        Implementation of the abstract method from AISSource that provides an iterator
        of all the objects in this group.

        Args:
            prefix (str, optional): Not used for object groups; present to match the
                AISSource interface

        Returns:
            Iterator of all the objects in the group
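
        Example:
            A brief usage sketch; the endpoint, bucket, and object names are
            placeholders::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("my-bck").objects(obj_names=["a.bin", "b.bin"])
                for obj in group.list_all_objects_iter():
                    print(obj.name)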
        """
        for obj_name in self._obj_collection:
            yield self.bck.object(obj_name)

    def delete(self):
        """
        Deletes a list or range of objects in a bucket

        Raises:
            aistore.sdk.errors.AISError: All other types of errors with AIStore
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.exceptions.HTTPError: Service unavailable
            requests.RequestException: Ambiguous exception while handling the request
            requests.ReadTimeout: Timed out receiving response from AIStore

        Returns:
            Job ID (as str) that can be used to check the status of the operation

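        Example:
            A minimal sketch; the endpoint, bucket, and object names are
            placeholders::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("my-bck").objects(obj_names=["obj-1", "obj-2"])
                job_id = group.delete()
                client.job(job_id).wait()  # optionally wait for the job to finish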
        """
        return self.bck.make_request(
            HTTP_METHOD_DELETE,
            ACT_DELETE_OBJECTS,
            value=self._obj_collection.get_value(),
        ).text

    def evict(self):
        """
        Evicts a list or range of objects in a bucket so that they are no longer
        cached in AIS.
        NOTE: only Cloud buckets can be evicted.

        Raises:
            aistore.sdk.errors.AISError: All other types of errors with AIStore
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.exceptions.HTTPError: Service unavailable
            requests.RequestException: Ambiguous exception while handling the request
            requests.ReadTimeout: Timed out receiving response from AIStore

        Returns:
            Job ID (as str) that can be used to check the status of the operation

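        Example:
            A minimal sketch; the endpoint, provider, and bucket are placeholders,
            and the bucket must be cloud-backed::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("my-bck", provider="aws").objects(
                    obj_template="shard-{0..99}.tar"
                )
                job_id = group.evict()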
        """
        self.bck.verify_cloud_bucket()
        return self.bck.make_request(
            HTTP_METHOD_DELETE,
            ACT_EVICT_OBJECTS,
            value=self._obj_collection.get_value(),
        ).text

    def prefetch(
        self,
        blob_threshold: int = None,
        latest: bool = False,
        continue_on_error: bool = False,
    ):
        """
        Prefetches a list or range of objects in a bucket so that they are cached in AIS.
        NOTE: only Cloud buckets can be prefetched.

        Args:
            blob_threshold (int, optional): Utilize built-in blob-downloader for remote objects
                greater than the specified (threshold) size in bytes
            latest (bool, optional): GET the latest object version from the associated remote bucket
            continue_on_error (bool, optional): Whether to continue if there is an error prefetching a single object

        Raises:
            aistore.sdk.errors.AISError: All other types of errors with AIStore
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.exceptions.HTTPError: Service unavailable
            requests.RequestException: Ambiguous exception while handling the request
            requests.ReadTimeout: Timed out receiving response from AIStore

        Returns:
            Job ID (as str) that can be used to check the status of the operation

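        Example:
            A minimal sketch; the endpoint, provider, and bucket are placeholders,
            and the bucket must be cloud-backed::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("my-bck", provider="aws").objects(
                    obj_template="shard-{0..99}.tar"
                )
                job_id = group.prefetch(
                    latest=True,
                    blob_threshold=1024 * 1024,  # blob-download objects larger than 1 MiB
                )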
        """
        self.bck.verify_cloud_bucket()

        value = PrefetchMsg(
            object_selection=self._obj_collection.get_value(),
            continue_on_err=continue_on_error,
            latest=latest,
            blob_threshold=blob_threshold,
        ).as_dict()

        return self.bck.make_request(
            HTTP_METHOD_POST,
            ACT_PREFETCH_OBJECTS,
            value=value,
        ).text

    # pylint: disable=too-many-arguments
    def copy(
        self,
        to_bck: "Bucket",
        prepend: str = "",
        continue_on_error: bool = False,
        dry_run: bool = False,
        force: bool = False,
        latest: bool = False,
        sync: bool = False,
    ):
        """
        Copies a list or range of objects in a bucket

        Args:
            to_bck (Bucket): Destination bucket
            prepend (str, optional): Value to prepend to the name of copied objects
            continue_on_error (bool, optional): Whether to continue if there is an error copying a single object
            dry_run (bool, optional): Skip performing the copy and just log the intended actions
            force (bool, optional): Force this job to run over others in case it conflicts
                (see "limited coexistence" and xact/xreg/xreg.go)
            latest (bool, optional): GET the latest object version from the associated remote bucket
            sync (bool, optional): Synchronize the destination bucket with its remote (e.g., Cloud or remote AIS) source

        Raises:
            aistore.sdk.errors.AISError: All other types of errors with AIStore
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.exceptions.HTTPError: Service unavailable
            requests.RequestException: Ambiguous exception while handling the request
            requests.ReadTimeout: Timed out receiving response from AIStore

        Returns:
            Job ID (as str) that can be used to check the status of the operation

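        Example:
            A minimal sketch; the endpoint and bucket names are placeholders::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("src-bck").objects(obj_names=["obj-1", "obj-2"])
                job_id = group.copy(to_bck=client.bucket("dst-bck"), prepend="copied/")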
        """
        if dry_run:
            logger = logging.getLogger(f"{__name__}.copy")
            logger.info(
                "Copy dry-run. Running with dry_run=False will copy the following objects from bucket '%s' to '%s': %s",
                self.bck.get_path(),
                to_bck.get_path(),
                list(self._obj_collection),
            )
        copy_msg = CopyBckMsg(
            prepend=prepend, dry_run=dry_run, force=force, latest=latest, sync=sync
        )

        value = TCMultiObj(
            to_bck=to_bck.as_model(),
            tc_msg=TCBckMsg(copy_msg=copy_msg),
            object_selection=self._obj_collection.get_value(),
            continue_on_err=continue_on_error,
        ).as_dict()

        return self.bck.make_request(
            HTTP_METHOD_POST,
            ACT_COPY_OBJECTS,
            value=value,
        ).text

    # pylint: disable=too-many-arguments
    def transform(
        self,
        to_bck: "Bucket",
        etl_name: str,
        timeout: str = DEFAULT_ETL_TIMEOUT,
        prepend: str = "",
        continue_on_error: bool = False,
        dry_run: bool = False,
        force: bool = False,
        latest: bool = False,
        sync: bool = False,
    ):
        """
        Performs an ETL operation on a list or range of objects in a bucket, placing
        the results in the destination bucket

        Args:
            to_bck (Bucket): Destination bucket
            etl_name (str): Name of existing ETL to apply
            timeout (str): Timeout of the ETL job (e.g., "5m" for 5 minutes)
            prepend (str, optional): Value to prepend to the name of resulting transformed objects
            continue_on_error (bool, optional): Whether to continue if there is an error transforming a single object
            dry_run (bool, optional): Skip performing the transform and just log the intended actions
            force (bool, optional): Force this job to run over others in case it conflicts
                (see "limited coexistence" and xact/xreg/xreg.go)
            latest (bool, optional): GET the latest object version from the associated remote bucket
            sync (bool, optional): Synchronize the destination bucket with its remote (e.g., Cloud or remote AIS) source

        Raises:
            aistore.sdk.errors.AISError: All other types of errors with AIStore
            requests.ConnectionError: Connection error
            requests.ConnectTimeout: Timed out connecting to AIStore
            requests.exceptions.HTTPError: Service unavailable
            requests.RequestException: Ambiguous exception while handling the request
            requests.ReadTimeout: Timed out receiving response from AIStore

        Returns:
            Job ID (as str) that can be used to check the status of the operation

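        Example:
            A minimal sketch; the endpoint, buckets, and ETL name are placeholders,
            and the named ETL must already exist on the cluster::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("src-bck").objects(obj_template="img-{0..999}.jpg")
                job_id = group.transform(
                    to_bck=client.bucket("dst-bck"),
                    etl_name="my-etl",
                    timeout="10m",
                )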
        """
        if dry_run:
            logger = logging.getLogger(f"{__name__}.transform")
            logger.info(
                "Transform dry-run. Running with dry_run=False will apply ETL '%s' to objects %s",
                etl_name,
                list(self._obj_collection),
            )

        copy_msg = CopyBckMsg(
            prepend=prepend, dry_run=dry_run, force=force, latest=latest, sync=sync
        )
        transform_msg = TransformBckMsg(etl_name=etl_name, timeout=timeout)
        value = TCMultiObj(
            to_bck=to_bck.as_model(),
            tc_msg=TCBckMsg(transform_msg=transform_msg, copy_msg=copy_msg),
            object_selection=self._obj_collection.get_value(),
            continue_on_err=continue_on_error,
        ).as_dict()
        return self.bck.make_request(
            HTTP_METHOD_POST, ACT_TRANSFORM_OBJECTS, value=value
        ).text

    def archive(
        self,
        archive_name: str,
        mime: str = "",
        to_bck: "Bucket" = None,
        include_source_name: bool = False,
        allow_append: bool = False,
        continue_on_err: bool = False,
    ):
        """
        Create or append to an archive

        Args:
            archive_name (str): Name of archive to create or append to
            mime (str, optional): MIME type of the content
            to_bck (Bucket, optional): Destination bucket; defaults to the current bucket
            include_source_name (bool, optional): Include the source bucket name in the archived objects' names
            allow_append (bool, optional): Allow appending to an existing archive
            continue_on_err (bool, optional): Whether to continue if there is an error archiving a single object

        Returns:
            Job ID (as str) that can be used to check the status of the operation

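        Example:
            A minimal sketch; the endpoint, bucket, and names are placeholders::

                from aistore.sdk import Client

                client = Client("http://localhost:8080")
                group = client.bucket("my-bck").objects(obj_names=["a.bin", "b.bin"])
                job_id = group.archive(archive_name="batch-00.tar", include_source_name=True)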
        """
        val = ArchiveMultiObj(
            object_selection=self._obj_collection.get_value(),
            archive_name=archive_name,
            mime=mime,
            to_bck=to_bck.as_model() if to_bck else self.bck.as_model(),
            include_source_name=include_source_name,
            allow_append=allow_append,
            continue_on_err=continue_on_err,
        ).as_dict()
        return self.bck.make_request(
            HTTP_METHOD_PUT, ACT_ARCHIVE_OBJECTS, value=val
        ).text

    def list_names(self) -> List[str]:
        """
        List all the object names included in this group of objects

        Returns:
            List of object names

        """
        return list(self._obj_collection)