github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/tests/integration/sdk/test_object_group_ops.py (about)

     1  #
     2  # Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
     3  #
     4  import hashlib
     5  import unittest
     6  import tarfile
     7  import io
     8  from datetime import datetime
     9  
    10  import pytest
    11  
    12  from aistore.sdk.const import PROVIDER_AIS, LOREM, DUIS
    13  from aistore.sdk.errors import InvalidBckProvider, AISError, JobInfoNotFound
    14  from tests.const import (
    15      SMALL_FILE_SIZE,
    16      MIB,
    17      OBJECT_COUNT,
    18      TEST_TIMEOUT,
    19      PREFIX_NAME,
    20      SUFFIX_NAME,
    21  )
    22  from tests.integration import REMOTE_SET
    23  from tests.integration.sdk.remote_enabled_test import RemoteEnabledTest
    24  from tests.utils import random_string
    25  
    26  
    27  # pylint: disable=unused-variable,too-many-instance-attributes
    28  class TestObjectGroupOps(RemoteEnabledTest):
    29      def setUp(self) -> None:
    30          super().setUp()
    31          self.suffix = SUFFIX_NAME
    32          self.obj_names = self._create_objects(suffix=self.suffix)
    33          if REMOTE_SET:
    34              self.s3_client = self._get_boto3_client()
    35  
    36      def test_delete(self):
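                # Delete every object except the first, then verify only the first remains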
    37          object_group = self.bucket.objects(obj_names=self.obj_names[1:])
    38          job_id = object_group.delete()
    39          self.client.job(job_id).wait(timeout=TEST_TIMEOUT)
    40          existing_objects = self.bucket.list_objects(prefix=self.obj_prefix).entries
    41          self.assertEqual(1, len(existing_objects))
    42          self.assertEqual(self.obj_names[0], existing_objects[0].name)
    43  
    44      @unittest.skipIf(
    45          not REMOTE_SET,
    46          "Remote bucket is not set",
    47      )
    48      def test_evict(self):
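                # Evict all but the first object; only the object at index 0 should remain cached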
    49          object_group = self.bucket.objects(obj_names=self.obj_names[1:])
    50          job_id = object_group.evict()
    51          self.client.job(job_id).wait(timeout=TEST_TIMEOUT)
    52          self._verify_cached_objects(OBJECT_COUNT, [0])
    53  
    54      def test_evict_objects_local(self):
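                # Evicting objects of an AIS-provider bucket is invalid and should raise InvalidBckProvider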
    55          local_bucket = self.client.bucket(random_string(), provider=PROVIDER_AIS)
    56          with self.assertRaises(InvalidBckProvider):
    57              local_bucket.objects(obj_names=[]).evict()
    58  
    59      @unittest.skipIf(
    60          not REMOTE_SET,
    61          "Remote bucket is not set",
    62      )
    63      def test_prefetch_list(self):
    64          obj_group = self.bucket.objects(obj_names=self.obj_names[1:])
    65          self._evict_all_objects()
    66          # Fetch back a specific object group and verify cache status
    67          job_id = obj_group.prefetch()
    68          self.client.job(job_id).wait(timeout=TEST_TIMEOUT * 2)
    69          self._verify_cached_objects(OBJECT_COUNT, range(1, OBJECT_COUNT))
    70  
    71      @unittest.skipIf(
    72          not REMOTE_SET,
    73          "Remote bucket is not set",
    74      )
    75      def test_prefetch_blob_download(self):
    76          obj_name = self.obj_prefix + str(OBJECT_COUNT) + self.suffix
    77          obj_names = self._create_objects(obj_names=[obj_name], obj_size=SMALL_FILE_SIZE)
    78          self.obj_names.extend(obj_names)
    79          obj_group = self.bucket.objects(obj_names=obj_names)
    80          self._evict_all_objects(num_obj=OBJECT_COUNT + 1)
    81          start_time = datetime.now().time()
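                # Prefetch with a 2 MiB blob threshold; objects above the threshold should be fetched via blob download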
    82          job_id = obj_group.prefetch(blob_threshold=2 * MIB)
    83          self.client.job(job_id=job_id).wait(timeout=TEST_TIMEOUT * 2)
    84          end_time = datetime.now().time()
    85          jobs_list = self.client.job(job_kind="blob-download").get_within_timeframe(
    86              start_time=start_time, end_time=end_time
    87          )
    88          self.assertTrue(len(jobs_list) > 0)
    89          self._verify_cached_objects(
    90              OBJECT_COUNT + 1, range(OBJECT_COUNT, OBJECT_COUNT + 1)
    91          )
    92  
    93      @unittest.skipIf(
    94          not REMOTE_SET,
    95          "Remote bucket is not set",
    96      )
    97      def test_prefetch_without_blob_download(self):
    98          obj_name = self.obj_prefix + str(OBJECT_COUNT) + self.suffix
    99          obj_names = self._create_objects(obj_names=[obj_name], obj_size=SMALL_FILE_SIZE)
   100          self.obj_names.extend(obj_names)
   101          obj_group = self.bucket.objects(obj_names=obj_names)
   102          self._evict_all_objects(num_obj=OBJECT_COUNT + 1)
   103          start_time = datetime.now().time()
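                # The threshold exceeds the object size, so no blob-download job should be started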
   104          job_id = obj_group.prefetch(blob_threshold=2 * SMALL_FILE_SIZE)
   105          self.client.job(job_id=job_id).wait(timeout=TEST_TIMEOUT * 2)
   106          end_time = datetime.now().time()
   107  
   108          with self.assertRaises(JobInfoNotFound):
   109              self.client.job(job_kind="blob-download").get_within_timeframe(
   110                  start_time=start_time, end_time=end_time
   111              )
   112  
   113          self._verify_cached_objects(
   114              OBJECT_COUNT + 1, range(OBJECT_COUNT, OBJECT_COUNT + 1)
   115          )
   116  
   117      def test_prefetch_objects_local(self):
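                # Prefetching objects of an AIS-provider bucket is invalid and should raise InvalidBckProvider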
   118          local_bucket = self.client.bucket(random_string(), provider=PROVIDER_AIS)
   119          with self.assertRaises(InvalidBckProvider):
   120              local_bucket.objects(obj_names=[]).prefetch()
   121  
   122      def test_copy_objects(self):
   123          to_bck_name = "destination-bucket"
   124          to_bck = self._create_bucket(to_bck_name)
   125          self.assertEqual(0, len(to_bck.list_all_objects(prefix=self.obj_prefix)))
   126          self.assertEqual(
   127              OBJECT_COUNT, len(self.bucket.list_all_objects(prefix=self.obj_prefix))
   128          )
   129  
   130          new_prefix = PREFIX_NAME
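                # Copy four objects (indices 1 through 4) to the destination bucket, prepending a prefix to each name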
   131          copy_job = self.bucket.objects(obj_names=self.obj_names[1:5]).copy(
   132              to_bck, prepend=new_prefix
   133          )
   134          self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
   135  
   136          self.assertEqual(
   137              4, len(to_bck.list_all_objects(prefix=new_prefix + self.obj_prefix))
   138          )
   139  
   140      @unittest.skipIf(
   141          not REMOTE_SET,
   142          "Remote bucket is not set",
   143      )
   144      def test_copy_objects_latest_flag(self):
   145          obj_name = random_string()
   146          self._register_for_post_test_cleanup(names=[obj_name], is_bucket=False)
   147          to_bck_name = "dst-bck-cp-latest"
   148          to_bck = self._create_bucket(to_bck_name)
   149  
   150          # out-of-band PUT: first version
   151          self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=LOREM)
   152  
   153          # copy, and check
   154          self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, LOREM, False)
   155          # create a cached copy in src bucket
   156          content = self.bucket.object(obj_name).get().read_all()
   157          self.assertEqual(LOREM, content.decode("utf-8"))
   158  
   159          # out-of-band PUT: 2nd version (overwrite)
   160          self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=DUIS)
   161  
   162          # copy and check (expecting the first version)
   163          self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, LOREM, False)
   164  
   165          # copy latest: update in-cluster copy
   166          self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, DUIS, True)
   167          # check that the cached copy in the src bck is still the prev version
   168          content = self.bucket.object(obj_name).get().read_all()
   169          self.assertEqual(LOREM, content.decode("utf-8"))
   170  
   171          # out-of-band DELETE
   172          self.s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)
   173  
   174          # copy and check (expecting no changes)
   175          self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, DUIS, True)
   176  
   177          # run copy with '--sync' one last time, and make sure the object "disappears"
   178          copy_job = self.bucket.objects(obj_names=[obj_name]).copy(
   179              self.bucket, sync=True
   180          )
   181          self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
   182          with self.assertRaises(AISError):
   183              self.bucket.object(obj_name).get()
   184  
   185      @unittest.skipIf(
   186          not REMOTE_SET,
   187          "Remote bucket is not set",
   188      )
   189      def test_copy_objects_sync_flag(self):
   190          to_bck_name = "dst-bck-cp-sync"
   191          to_bck = self._create_bucket(to_bck_name)
   192  
   193          # run copy with '--sync' to a different dst and make sure the objects "disappear"
   194          # multi-obj --sync currently only supports templates
   195          # TODO: add a test for multi-obj list --sync once the API is ready
   196          template = self.obj_prefix + "{0..10}" + self.suffix
   197          copy_job = self.bucket.objects(obj_template=template).copy(to_bck)
   198          self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
   199          self.assertEqual(
   200              len(to_bck.list_all_objects(prefix=self.obj_prefix)), OBJECT_COUNT
   201          )
   202  
   203          prefetch_job = self.bucket.objects(obj_template=template).prefetch()
   204          self.client.job(job_id=prefetch_job).wait_for_idle(timeout=TEST_TIMEOUT)
   205  
   206          # out-of-band DELETE: remove all objects
   207          for obj_name in self.obj_names:
   208              self.s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)
   209  
   210          copy_job = self.bucket.objects(obj_template=template).copy(to_bck, sync=True)
   211          self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
   212          # check that all the objects in dst disappear after cp multi-obj sync
   213          self.assertEqual(len(to_bck.list_all_objects(prefix=self.obj_prefix)), 0)
   214          # objects also disappear from the src bck
   215          self.assertEqual(len(self.bucket.list_all_objects(prefix=self.obj_prefix)), 0)
   216  
   217      @unittest.skipIf(
   218          not REMOTE_SET,
   219          "Remote bucket is not set",
   220      )
   221      def test_prefetch_objects_latest_flag(self):
   222          obj_name = random_string()
   223          self._register_for_post_test_cleanup(names=[obj_name], is_bucket=False)
   224  
   225          # out-of-band PUT: first version
   226          self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=LOREM)
   227  
   228          # prefetch, and check
   229          self._prefetch_and_check_with_latest(self.bucket, obj_name, LOREM, False)
   230  
   231          # out-of-band PUT: 2nd version (overwrite)
   232          self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=DUIS)
   233  
   234          # prefetch and check (expecting the first version)
   235          self._prefetch_and_check_with_latest(self.bucket, obj_name, LOREM, False)
   236  
   237          # prefetch latest: update in-cluster copy
   238          self._prefetch_and_check_with_latest(self.bucket, obj_name, DUIS, True)
   239  
   240          # out-of-band DELETE
   241          self.s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)
   242  
   243          # prefetch without '--latest': expecting no changes
   244          self._prefetch_and_check_with_latest(self.bucket, obj_name, DUIS, False)
   245  
   246          # run prefetch with '--latest' one last time, and make sure the object "disappears"
   247          prefetch_job = self.bucket.objects(obj_names=[obj_name]).prefetch(latest=True)
   248          self.client.job(job_id=prefetch_job).wait_for_idle(timeout=TEST_TIMEOUT)
   249          with self.assertRaises(AISError):
   250              self.bucket.object(obj_name).get()
   251  
   252      def _prefetch_and_check_with_latest(self, bucket, obj_name, expected, latest_flag):
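            # Prefetch a single object with the given 'latest' setting and verify its cached content matches 'expected'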
   253          prefetch_job = bucket.objects(obj_names=[obj_name]).prefetch(latest=latest_flag)
   254          self.client.job(job_id=prefetch_job).wait_for_idle(timeout=TEST_TIMEOUT)
   255  
   256          content = bucket.object(obj_name).get().read_all()
   257          self.assertEqual(expected, content.decode("utf-8"))
   258  
   259      # pylint: disable=too-many-arguments
   260      def _copy_and_check_with_latest(
   261          self, from_bck, to_bck, obj_name, expected, latest_flag
   262      ):
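                # Copy a single object with the given 'latest' setting and verify the destination holds the expected content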
   263          copy_job = from_bck.objects(obj_names=[obj_name]).copy(
   264              to_bck, latest=latest_flag
   265          )
   266          self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
   267          self.assertEqual(1, len(to_bck.list_all_objects()))
   268          content = to_bck.object(obj_name).get().read_all()
   269          self.assertEqual(expected, content.decode("utf-8"))
   270  
   271      def test_archive_objects_without_copy(self):
   272          arch_name = self.obj_prefix + "-archive-without-copy.tar"
   273          self._archive_exec_assert(arch_name, self.bucket, self.bucket)
   274  
   275      def test_archive_objects_with_copy(self):
   276          arch_name = self.obj_prefix + "-archive-with-copy.tar"
   277          dest_bck = self._create_bucket(random_string())
   278          self._archive_exec_assert(arch_name, self.bucket, dest_bck, to_bck=dest_bck)
   279  
   280      def _archive_exec_assert(self, arch_name, src_bck, res_bck, **kwargs):
   281          # Register the archive for cleanup when the test finishes
   282          if res_bck.provider != PROVIDER_AIS:
   283              self._register_for_post_test_cleanup(names=[arch_name], is_bucket=False)
   284          archived_names = self.obj_names[1:5]
   285          expected_contents = {}
   286          for name in archived_names:
   287              expected_contents[name] = src_bck.object(obj_name=name).get().read_all()
   288  
   289          arch_job = src_bck.objects(obj_names=archived_names).archive(
   290              archive_name=arch_name, **kwargs
   291          )
   292          self.client.job(job_id=arch_job).wait_for_idle(timeout=TEST_TIMEOUT)
   293  
   294          # Read the tar archive and assert the object names and contents match
   295          res_bytes = res_bck.object(arch_name).get().read_all()
   296          with tarfile.open(fileobj=io.BytesIO(res_bytes), mode="r") as tar:
   297              member_names = []
   298              for member in tar.getmembers():
   299                  inner_file = tar.extractfile(member)
   300                  self.assertEqual(expected_contents[member.name], inner_file.read())
   301                  inner_file.close()
   302                  member_names.append(member.name)
   303              self.assertEqual(set(archived_names), set(member_names))
   304  
   305      @pytest.mark.etl
   306      def test_transform_objects(self):
   307          # Define an ETL whose transform computes an MD5 hash of each object's contents
   308          etl_name = "etl-" + random_string(5)
   309  
   310          def transform(input_bytes):
   311              md5 = hashlib.md5()
   312              md5.update(input_bytes)
   313              return md5.hexdigest().encode()
   314  
   315          md5_etl = self.client.etl(etl_name)
   316          md5_etl.init_code(transform=transform)
   317  
   318          to_bck_name = "destination-bucket"
   319          to_bck = self._create_bucket(to_bck_name)
   320          new_prefix = PREFIX_NAME
   321          self.assertEqual(0, len(to_bck.list_all_objects(prefix=self.obj_prefix)))
   322          self.assertEqual(
   323              OBJECT_COUNT, len(self.bucket.list_all_objects(prefix=self.obj_prefix))
   324          )
   325  
   326          transform_job = self.bucket.objects(obj_names=self.obj_names).transform(
   327              to_bck, etl_name=md5_etl.name, prepend=new_prefix
   328          )
   329          self.client.job(job_id=transform_job).wait_for_idle(timeout=TEST_TIMEOUT)
   330  
   331          # Compute the MD5 transform of each source object and verify the destination bucket contains those results
   332          from_obj_hashes = [
   333              transform(self.bucket.object(name).get().read_all())
   334              for name in self.obj_names
   335          ]
   336          to_obj_values = [
   337              to_bck.object(new_prefix + name).get().read_all() for name in self.obj_names
   338          ]
   339          self.assertEqual(to_obj_values, from_obj_hashes)
   340  
   341      def _evict_all_objects(self, num_obj=OBJECT_COUNT):
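                # Evict all created objects and verify that none remain cached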
   342          job_id = self.bucket.objects(obj_names=self.obj_names).evict()
   343          self.client.job(job_id).wait(timeout=TEST_TIMEOUT)
   344          self._check_all_objects_cached(num_obj, expected_cached=False)