github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/tests/integration/sdk/test_bucket_ops.py

#
# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
#
import unittest
from pathlib import Path

import boto3

import requests

from aistore.sdk import ListObjectFlag
from aistore.sdk.const import UTF_ENCODING, LOREM, DUIS
from aistore.sdk.dataset.dataset_config import DatasetConfig
from aistore.sdk.dataset.data_attribute import DataAttribute
from aistore.sdk.dataset.label_attribute import LabelAttribute
from aistore.sdk.errors import InvalidBckProvider, AISError, ErrBckNotFound

from tests.integration.sdk.remote_enabled_test import RemoteEnabledTest
from tests import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
from tests.integration.boto3 import AWS_REGION

from tests.utils import random_string, cleanup_local, test_cases
from tests.const import OBJ_NAME, OBJECT_COUNT, OBJ_CONTENT, PREFIX_NAME
from tests.integration import REMOTE_SET

INNER_DIR = "directory"
DATASET_DIR = "dataset"
TOP_LEVEL_FILES = {
    "top_level_file.txt": b"test data to verify",
    "other_top_level_file.txt": b"other file test data to verify",
}
LOWER_LEVEL_FILES = {"lower_level_file.txt": b"data in inner file"}


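# Helper: write each (filename, bytes) pair in file_dict as a file under folder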
def _create_files(folder, file_dict):
    for filename, data in file_dict.items():
        lower_file = folder.joinpath(filename)
        with open(lower_file, "wb") as file:
            file.write(data)


# pylint: disable=unused-variable, too-many-public-methods
class TestBucketOps(RemoteEnabledTest):
    def setUp(self) -> None:
        super().setUp()
        self.local_test_files = (
            Path().absolute().joinpath("bucket-ops-test-" + random_string(8))
        )

    def tearDown(self) -> None:
        super().tearDown()
        cleanup_local(str(self.local_test_files))

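    # Create top-level files plus one nested directory of files, used to
    # exercise recursive vs. non-recursive put_files behavior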
    def _create_put_files_structure(self, top_level_files, lower_level_files):
        self.local_test_files.mkdir(exist_ok=True)
        _create_files(self.local_test_files, top_level_files)
        inner_dir = self.local_test_files.joinpath(INNER_DIR)
        inner_dir.mkdir()
        _create_files(inner_dir, lower_level_files)

    def test_bucket(self):
        new_bck_name = random_string(10)
        self._create_bucket(new_bck_name)
        res = self.client.cluster().list_buckets()
        bucket_names = {bck.name for bck in res}
        self.assertIn(new_bck_name, bucket_names)

    @test_cases(
        "*", ".", "", " ", "bucket/name", "bucket and name", "#name", "$name", "~name"
    )
    def test_create_bucket_invalid_name(self, testcase):
        with self.assertRaises(AISError):
            self._create_bucket(testcase)

    def test_bucket_invalid_name(self):
        with self.assertRaises(ErrBckNotFound):
            self.client.bucket("INVALID_BCK_NAME").list_objects()

    def test_bucket_invalid_aws_name(self):
        with self.assertRaises(AISError):
            self.client.bucket("INVALID_BCK_NAME", "aws").list_objects()

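    # head() should succeed for an existing bucket; a missing bucket surfaces
    # as an HTTPError with status 404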
    def test_head(self):
        try:
            self.bucket.head()
        except requests.exceptions.HTTPError as err:
            self.assertEqual(err.response.status_code, 404)

    def test_rename(self):
        from_bck_name = self.bck_name + "from"
        to_bck_name = self.bck_name + "to"
        from_bck = self._create_bucket(from_bck_name)
        self.client.cluster().list_buckets()

        self.assertEqual(from_bck_name, from_bck.name)
        job_id = from_bck.rename(to_bck_name=to_bck_name)
        self.assertNotEqual(job_id, "")

        # wait for rename to finish
        self.client.job(job_id).wait()

        # new bucket should be created and accessible
        to_bck = self.client.bucket(to_bck_name)
        to_bck.head()
        self.assertEqual(to_bck_name, to_bck.name)

        # old bucket should be inaccessible
        try:
            from_bck.head()
        except requests.exceptions.HTTPError as err:
            self.assertEqual(err.response.status_code, 404)
        self._register_for_post_test_cleanup(names=[to_bck_name], is_bucket=True)

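    # Copy should include only the objects matching prefix_filter and prepend
    # new_prefix to each copied object's name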
    def test_copy(self):
        from_bck_name = self.bck_name + "from"
        to_bck_name = self.bck_name + "to"
        from_bck = self._create_bucket(from_bck_name)
        to_bck = self._create_bucket(to_bck_name)
        prefix = PREFIX_NAME
        new_prefix = "new-"
        content = b"test"
        expected_name = prefix + "-obj"
        from_bck.object(expected_name).put_content(content)
        from_bck.object("notprefix-obj").put_content(content)

        job_id = from_bck.copy(to_bck, prefix_filter=prefix, prepend=new_prefix)

        self.assertNotEqual(job_id, "")
        self.client.job(job_id).wait()
        entries = to_bck.list_all_objects()
        self.assertEqual(1, len(entries))
        self.assertEqual(new_prefix + expected_name, entries[0].name)

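    # Requires a remote bucket: verifies that GET with latest=True picks up
    # out-of-band changes made directly in the backend via boto3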
    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_get_latest_flag(self):
        obj_name = random_string()
        self._register_for_post_test_cleanup(names=[obj_name], is_bucket=False)

        s3_client = boto3.client(
            "s3",
            region_name=AWS_REGION,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            # aws_session_token=AWS_SESSION_TOKEN,
        )

        # out-of-band PUT: first version
        s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=LOREM)

        # cold GET, and check
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(LOREM, content.decode("utf-8"))

        # out-of-band PUT: second version (overwrite)
        s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=DUIS)

        # warm GET and check (expecting the first version's content)
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(LOREM, content.decode("utf-8"))

        # warm GET with the `latest` flag; content should be updated
        content = self.bucket.object(obj_name).get(latest=True).read_all()
        self.assertEqual(DUIS, content.decode("utf-8"))

        # out-of-band DELETE
        s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)

        # warm GET must still succeed (the cached copy remains)
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(DUIS, content.decode("utf-8"))

        # cold GET must result in an error (the object no longer exists remotely)
        with self.assertRaises(AISError):
            self.bucket.object(obj_name).get(latest=True)

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_evict(self):
        self._create_objects()
        objects = self.bucket.list_objects(
            props="name,cached", prefix=self.obj_prefix
        ).entries
        self._validate_objects_cached(objects, True)

        self.bucket.evict(keep_md=True)

        objects = self.bucket.list_objects(
            props="name,cached", prefix=self.obj_prefix
        ).entries
        self.assertEqual(OBJECT_COUNT, len(objects))
        self._validate_objects_cached(objects, False)

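    # Eviction applies only to remote buckets; attempting it on an ais://
    # bucket must raise InvalidBckProvider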
    def test_evict_local(self):
        # If the test bucket itself is local, evicting it should fail
        if not REMOTE_SET:
            with self.assertRaises(InvalidBckProvider):
                self.bucket.evict()
            return
        # self.bucket is remote here, so create a local bucket to test against
        local_bucket = self._create_bucket(self.bck_name + "-local")
        with self.assertRaises(InvalidBckProvider):
            local_bucket.evict()

    def test_put_files_invalid(self):
        with self.assertRaises(ValueError):
            self.bucket.put_files("non-existent-dir")
        self.local_test_files.mkdir()
        filename = self.local_test_files.joinpath("file_not_dir")
        with open(filename, "w", encoding=UTF_ENCODING):
            pass
        with self.assertRaises(ValueError):
            self.bucket.put_files(filename)

    def _verify_obj_res(self, expected_res_dict, expect_err=False):
        if expect_err:
            for obj_name in expected_res_dict:
                with self.assertRaises(AISError):
                    self.bucket.object(self.obj_prefix + obj_name).get()
        else:
            for obj_name, expected_data in expected_res_dict.items():
                res = self.bucket.object(self.obj_prefix + obj_name).get()
                self.assertEqual(expected_data, res.read_all())

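    # put_files is non-recursive by default, so files in nested directories
    # should not be uploaded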
    def test_put_files_default_args(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(self.local_test_files, prepend=self.obj_prefix)
        self._verify_obj_res(TOP_LEVEL_FILES)
        self._verify_obj_res(LOWER_LEVEL_FILES, expect_err=True)

    def test_put_files_recursive(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(
            self.local_test_files, recursive=True, prepend=self.obj_prefix
        )

        self._verify_obj_res(TOP_LEVEL_FILES)
        # Lower-level file object names will include their relative path by default
        expected_lower_res = {}
        for obj_name, expected_data in LOWER_LEVEL_FILES.items():
            obj_name = str(Path(INNER_DIR).joinpath(obj_name))
            expected_lower_res[obj_name] = expected_data
        self._verify_obj_res(expected_lower_res)

    def test_put_files_recursive_basename(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(
            self.local_test_files,
            recursive=True,
            basename=True,
            prepend=self.obj_prefix,
        )

        # Expect all objects to be prefixed by obj_prefix, with no relative path
        # in the name due to the basename option
        joined_file_data = {**TOP_LEVEL_FILES, **LOWER_LEVEL_FILES}
        expected_res = {}
        for obj_name, expected_data in joined_file_data.items():
            expected_res[obj_name] = expected_data
        self._verify_obj_res(expected_res)

    def test_put_files_filtered(self):
        self.local_test_files.mkdir()
        included_filename = "prefix-file.txt"
        excluded_by_pattern = "extra_top_file.py"
        excluded_by_prefix = "non-prefix-file.txt"
        for file in [included_filename, excluded_by_pattern, excluded_by_prefix]:
            with open(self.local_test_files.joinpath(file), "wb"):
                pass
        self.bucket.put_files(
            self.local_test_files,
            prepend=self.obj_prefix,
            prefix_filter=PREFIX_NAME,
            pattern="*.txt",
        )
        self.bucket.object(self.obj_prefix + included_filename).get()
        with self.assertRaises(AISError):
            self.bucket.object(excluded_by_pattern).get()
        with self.assertRaises(AISError):
            self.bucket.object(excluded_by_prefix).get()

    def test_put_files_dry_run(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(
            self.local_test_files, dry_run=True, prepend=self.obj_prefix
        )
        # Verify the dry-run put_files call does not actually create objects
        self._verify_obj_res(TOP_LEVEL_FILES, expect_err=True)

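    # Each case is (page_size, expected number of entries in the response);
    # a page size larger than the object count returns everything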
    @test_cases((None, OBJECT_COUNT), (7, 7), (OBJECT_COUNT * 2, OBJECT_COUNT))
    def test_list_objects(self, test_case):
        page_size, response_size = test_case
        # Only create the bucket entries on the first subtest run
        if len(self.bucket.list_all_objects(prefix=self.obj_prefix)) == 0:
            self._create_objects()
        if page_size:
            resp = self.bucket.list_objects(page_size=page_size, prefix=self.obj_prefix)
        else:
            resp = self.bucket.list_objects(prefix=self.obj_prefix)
        self.assertEqual(response_size, len(resp.entries))

    def test_list_all_objects(self):
        short_page_len = 17
        self._create_objects()
        objects = self.bucket.list_all_objects(prefix=self.obj_prefix)
        self.assertEqual(OBJECT_COUNT, len(objects))
        objects = self.bucket.list_all_objects(
            page_size=short_page_len, prefix=self.obj_prefix
        )
        self.assertEqual(OBJECT_COUNT, len(objects))

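    # Iterating with a small page size should still yield every created object
    # exactly once (set.remove would raise on a duplicate)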
    def test_list_object_iter(self):
        obj_names = set(self._create_objects())

        # Empty iterator if there are no objects matching the prefix.
        obj_iter = self.bucket.list_objects_iter(prefix="invalid-obj-")
        self.assertEqual(0, len(list(obj_iter)))

        # Read back all created objects by prefix.
        obj_iter = self.bucket.list_objects_iter(page_size=10, prefix=self.obj_prefix)
        for obj in obj_iter:
            obj_names.remove(obj.name)
        self.assertEqual(0, len(obj_names))

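    # With NAME_ONLY, entry sizes come back as 0; with NAME_SIZE, each entry
    # reports a positive size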
    def test_list_object_flags(self):
        self._create_objects()
        objects = self.bucket.list_all_objects(
            flags=[ListObjectFlag.NAME_ONLY, ListObjectFlag.CACHED],
            prefix=self.obj_prefix,
        )
        self.assertEqual(OBJECT_COUNT, len(objects))
        for obj in objects:
            self.assertEqual(0, obj.size)

        objects = self.bucket.list_all_objects(
            flags=[ListObjectFlag.NAME_SIZE], prefix=self.obj_prefix
        )
        self.assertEqual(OBJECT_COUNT, len(objects))
        for obj in objects:
            self.assertTrue(obj.size > 0)

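    # summary() reports object counts and sizes as strings; an empty bucket
    # reports zeros across the board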
    def test_summary(self):
        summ_test_bck = self._create_bucket("summary-test")

        # Initially, the bucket should be empty
        bucket_summary = summ_test_bck.summary()

        self.assertEqual(bucket_summary["ObjCount"]["obj_count_present"], "0")
        self.assertEqual(bucket_summary["TotalSize"]["size_all_present_objs"], "0")
        self.assertEqual(bucket_summary["TotalSize"]["size_all_remote_objs"], "0")
        self.assertEqual(bucket_summary["used_pct"], 0)

        summ_test_bck.object(OBJ_NAME).put_content(OBJ_CONTENT)

        bucket_summary = summ_test_bck.summary()

        # Now, the bucket should have 1 object
        self.assertEqual(bucket_summary["ObjCount"]["obj_count_present"], "1")
        self.assertNotEqual(bucket_summary["TotalSize"]["size_all_present_objs"], "0")

        summ_test_bck.delete()

        # Accessing the summary of a deleted bucket should raise an error
        with self.assertRaises(ErrBckNotFound):
            summ_test_bck.summary()

    def test_info(self):
        info_test_bck = self._create_bucket("info-test")

        # Initially, the bucket should be empty
        _, bck_summ = info_test_bck.info(flt_presence=0)

        # For an empty bucket, the object count and total size should be zero
        self.assertEqual(bck_summ["ObjCount"]["obj_count_present"], "0")
        self.assertEqual(bck_summ["TotalSize"]["size_all_present_objs"], "0")
        self.assertEqual(bck_summ["TotalSize"]["size_all_remote_objs"], "0")
        self.assertEqual(bck_summ["provider"], "ais")
        self.assertEqual(bck_summ["name"], "info-test")

        # Upload an object to the bucket
        info_test_bck.object(OBJ_NAME).put_content(OBJ_CONTENT)

        _, bck_summ = info_test_bck.info()

        # Now the bucket should have one object and non-zero size
        self.assertEqual(bck_summ["ObjCount"]["obj_count_present"], "1")
        self.assertNotEqual(bck_summ["TotalSize"]["size_all_present_objs"], "0")
        self.assertEqual(bck_summ["TotalSize"]["size_all_remote_objs"], "0")
        self.assertEqual(bck_summ["provider"], "ais")
        self.assertEqual(bck_summ["name"], "info-test")

        info_test_bck.delete()

        # Accessing the info of a deleted bucket should raise an error
        with self.assertRaises(ErrBckNotFound):
            info_test_bck.info()

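    # write_dataset packs the dataset into shard(s) in the bucket, invoking
    # `post` once per shard written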
    def test_write_dataset(self):
        self.local_test_files.mkdir(exist_ok=True)
        dataset_directory = self.local_test_files.joinpath(DATASET_DIR)
        dataset_directory.mkdir(exist_ok=True)
        img_files = {
            "file1.jpg": b"file1",
            "file2.jpg": b"file2",
            "file3.jpg": b"file3",
        }
        _create_files(dataset_directory, img_files)

        dataset_config = DatasetConfig(
            primary_attribute=DataAttribute(
                path=dataset_directory, name="image", file_type="jpg"
            ),
            secondary_attributes=[
                LabelAttribute(
                    name="label", label_identifier=lambda filename: f"{filename}_label"
                )
            ],
        )
        shards = []

        def post_process(shard_path):
            self._register_for_post_test_cleanup(names=[shard_path], is_bucket=False)
            shards.append(shard_path)

        self.bucket.write_dataset(
            dataset_config, pattern="dataset", maxcount=10, post=post_process
        )
        self.assertEqual(len(shards), 1)
        for shard in shards:
            self.assertIsNotNone(self.bucket.object(shard).head())

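    # Same dataset as above, but the secondary label attribute cannot be
    # resolved for any file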
    def test_write_dataset_missing_attributes(self):
        self.local_test_files.mkdir(exist_ok=True)
        dataset_directory = self.local_test_files.joinpath(DATASET_DIR)
        dataset_directory.mkdir(exist_ok=True)
        img_files = {
            "file1.jpg": b"file1",
            "file2.jpg": b"file2",
            "file3.jpg": b"file3",
        }
        _create_files(dataset_directory, img_files)

        dataset_config = DatasetConfig(
            primary_attribute=DataAttribute(
                path=dataset_directory, name="image", file_type="jpg"
            ),
            secondary_attributes=[
                LabelAttribute(name="cls", label_identifier=lambda filename: None)
            ],
        )
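        # Every label_identifier call returns None, so each sample is missing
        # its "cls" label; with skip_missing=False the write is still expected
        # to complete without raising (the test makes no further assertions)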
        self.bucket.write_dataset(
            dataset_config, skip_missing=False, pattern="dataset", maxcount=10
        )