github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/tests/integration/sdk/test_object_group_ops.py

#
# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
#
import hashlib
import unittest
import tarfile
import io
from datetime import datetime

import pytest

from aistore.sdk.const import PROVIDER_AIS, LOREM, DUIS
from aistore.sdk.errors import InvalidBckProvider, AISError, JobInfoNotFound
from tests.const import (
    SMALL_FILE_SIZE,
    MIB,
    OBJECT_COUNT,
    TEST_TIMEOUT,
    PREFIX_NAME,
    SUFFIX_NAME,
)
from tests.integration import REMOTE_SET
from tests.integration.sdk.remote_enabled_test import RemoteEnabledTest
from tests.utils import random_string


# pylint: disable=unused-variable,too-many-instance-attributes
class TestObjectGroupOps(RemoteEnabledTest):
    def setUp(self) -> None:
        super().setUp()
        self.suffix = SUFFIX_NAME
        self.obj_names = self._create_objects(suffix=self.suffix)
        if REMOTE_SET:
            self.s3_client = self._get_boto3_client()

    def test_delete(self):
        object_group = self.bucket.objects(obj_names=self.obj_names[1:])
        job_id = object_group.delete()
        self.client.job(job_id).wait(timeout=TEST_TIMEOUT)
        existing_objects = self.bucket.list_objects(prefix=self.obj_prefix).entries
        self.assertEqual(1, len(existing_objects))
        self.assertEqual(self.obj_names[0], existing_objects[0].name)

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_evict(self):
        object_group = self.bucket.objects(obj_names=self.obj_names[1:])
        job_id = object_group.evict()
        self.client.job(job_id).wait(timeout=TEST_TIMEOUT)
        self._verify_cached_objects(OBJECT_COUNT, [0])

    def test_evict_objects_local(self):
        local_bucket = self.client.bucket(random_string(), provider=PROVIDER_AIS)
        with self.assertRaises(InvalidBckProvider):
            local_bucket.objects(obj_names=[]).evict()

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_prefetch_list(self):
        obj_group = self.bucket.objects(obj_names=self.obj_names[1:])
        self._evict_all_objects()
        # Fetch back a specific object group and verify cache status
        job_id = obj_group.prefetch()
        self.client.job(job_id).wait(timeout=TEST_TIMEOUT * 2)
        self._verify_cached_objects(OBJECT_COUNT, range(1, OBJECT_COUNT))

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_prefetch_blob_download(self):
        obj_name = self.obj_prefix + str(OBJECT_COUNT) + self.suffix
        obj_names = self._create_objects(obj_names=[obj_name], obj_size=SMALL_FILE_SIZE)
        self.obj_names.extend(obj_names)
        obj_group = self.bucket.objects(obj_names=obj_names)
        self._evict_all_objects(num_obj=OBJECT_COUNT + 1)
        start_time = datetime.now().time()
        job_id = obj_group.prefetch(blob_threshold=2 * MIB)
        self.client.job(job_id=job_id).wait(timeout=TEST_TIMEOUT * 2)
        end_time = datetime.now().time()
        jobs_list = self.client.job(job_kind="blob-download").get_within_timeframe(
            start_time=start_time, end_time=end_time
        )
        self.assertTrue(len(jobs_list) > 0)
        self._verify_cached_objects(
            OBJECT_COUNT + 1, range(OBJECT_COUNT, OBJECT_COUNT + 1)
        )

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_prefetch_without_blob_download(self):
        obj_name = self.obj_prefix + str(OBJECT_COUNT) + self.suffix
        obj_names = self._create_objects(obj_names=[obj_name], obj_size=SMALL_FILE_SIZE)
        self.obj_names.extend(obj_names)
        obj_group = self.bucket.objects(obj_names=obj_names)
        self._evict_all_objects(num_obj=OBJECT_COUNT + 1)
        start_time = datetime.now().time()
        job_id = obj_group.prefetch(blob_threshold=2 * SMALL_FILE_SIZE)
        self.client.job(job_id=job_id).wait(timeout=TEST_TIMEOUT * 2)
        end_time = datetime.now().time()

        with self.assertRaises(JobInfoNotFound):
            self.client.job(job_kind="blob-download").get_within_timeframe(
                start_time=start_time, end_time=end_time
            )

        self._verify_cached_objects(
            OBJECT_COUNT + 1, range(OBJECT_COUNT, OBJECT_COUNT + 1)
        )

    def test_prefetch_objects_local(self):
        local_bucket = self.client.bucket(random_string(), provider=PROVIDER_AIS)
        with self.assertRaises(InvalidBckProvider):
            local_bucket.objects(obj_names=[]).prefetch()

    def test_copy_objects(self):
        to_bck_name = "destination-bucket"
        to_bck = self._create_bucket(to_bck_name)
        self.assertEqual(0, len(to_bck.list_all_objects(prefix=self.obj_prefix)))
        self.assertEqual(
            OBJECT_COUNT, len(self.bucket.list_all_objects(prefix=self.obj_prefix))
        )

        new_prefix = PREFIX_NAME
        copy_job = self.bucket.objects(obj_names=self.obj_names[1:5]).copy(
            to_bck, prepend=new_prefix
        )
        self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)

        self.assertEqual(
            4, len(to_bck.list_all_objects(prefix=new_prefix + self.obj_prefix))
        )

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_copy_objects_latest_flag(self):
        obj_name = random_string()
        self._register_for_post_test_cleanup(names=[obj_name], is_bucket=False)
        to_bck_name = "dst-bck-cp-latest"
        to_bck = self._create_bucket(to_bck_name)

        # out-of-band PUT: first version
        self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=LOREM)

        # copy, and check
        self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, LOREM, False)
        # create a cached copy in the src bucket
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(LOREM, content.decode("utf-8"))

        # out-of-band PUT: 2nd version (overwrite)
        self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=DUIS)

        # copy and check (expecting the first version)
        self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, LOREM, False)

        # copy latest: update the in-cluster copy
        self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, DUIS, True)
        # check that the cached copy in the src bucket is still the previous version
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(LOREM, content.decode("utf-8"))

        # out-of-band DELETE
        self.s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)

        # copy and check (expecting no changes)
        self._copy_and_check_with_latest(self.bucket, to_bck, obj_name, DUIS, True)

        # run copy with '--sync' one last time, and make sure the object "disappears"
        copy_job = self.bucket.objects(obj_names=[obj_name]).copy(
            self.bucket, sync=True
        )
        self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
        with self.assertRaises(AISError):
            self.bucket.object(obj_name).get()

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_copy_objects_sync_flag(self):
        to_bck_name = "dst-bck-cp-sync"
        to_bck = self._create_bucket(to_bck_name)

        # run copy with '--sync' on a different dst, and make sure the objects "disappear"
        # multi-obj --sync currently only supports templates
        # TODO: add test for multi-obj list --sync once the API is ready
        template = self.obj_prefix + "{0..10}" + self.suffix
        copy_job = self.bucket.objects(obj_template=template).copy(to_bck)
        self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
        self.assertEqual(
            len(to_bck.list_all_objects(prefix=self.obj_prefix)), OBJECT_COUNT
        )

        prefetch_job = self.bucket.objects(obj_template=template).prefetch()
        self.client.job(job_id=prefetch_job).wait_for_idle(timeout=TEST_TIMEOUT)

        # out-of-band delete of all objects
        for obj_name in self.obj_names:
            self.s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)

        copy_job = self.bucket.objects(obj_template=template).copy(to_bck, sync=True)
        self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
        # check that all the objects in dst disappear after cp multi-obj sync
        self.assertEqual(len(to_bck.list_all_objects(prefix=self.obj_prefix)), 0)
        # objects also disappear from the src bck
        self.assertEqual(len(self.bucket.list_all_objects(prefix=self.obj_prefix)), 0)

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_prefetch_objects_latest_flag(self):
        obj_name = random_string()
        self._register_for_post_test_cleanup(names=[obj_name], is_bucket=False)

        # out-of-band PUT: first version
        self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=LOREM)

        # prefetch, and check
        self._prefetch_and_check_with_latest(self.bucket, obj_name, LOREM, False)

        # out-of-band PUT: 2nd version (overwrite)
        self.s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=DUIS)

        # prefetch and check (expecting the first version)
        self._prefetch_and_check_with_latest(self.bucket, obj_name, LOREM, False)

        # prefetch latest: update the in-cluster copy
        self._prefetch_and_check_with_latest(self.bucket, obj_name, DUIS, True)

        # out-of-band DELETE
        self.s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)

        # prefetch without '--latest': expecting no changes
        self._prefetch_and_check_with_latest(self.bucket, obj_name, DUIS, False)

        # run prefetch with '--latest' one last time, and make sure the object "disappears"
        prefetch_job = self.bucket.objects(obj_names=[obj_name]).prefetch(latest=True)
        self.client.job(job_id=prefetch_job).wait_for_idle(timeout=TEST_TIMEOUT)
        with self.assertRaises(AISError):
            self.bucket.object(obj_name).get()

    def _prefetch_and_check_with_latest(self, bucket, obj_name, expected, latest_flag):
        prefetch_job = bucket.objects(obj_names=[obj_name]).prefetch(latest=latest_flag)
        self.client.job(job_id=prefetch_job).wait_for_idle(timeout=TEST_TIMEOUT)

        content = bucket.object(obj_name).get().read_all()
        self.assertEqual(expected, content.decode("utf-8"))

    # pylint: disable=too-many-arguments
    def _copy_and_check_with_latest(
        self, from_bck, to_bck, obj_name, expected, latest_flag
    ):
        copy_job = from_bck.objects(obj_names=[obj_name]).copy(
            to_bck, latest=latest_flag
        )
        self.client.job(job_id=copy_job).wait_for_idle(timeout=TEST_TIMEOUT)
        self.assertEqual(1, len(to_bck.list_all_objects()))
        content = to_bck.object(obj_name).get().read_all()
        self.assertEqual(expected, content.decode("utf-8"))

    def test_archive_objects_without_copy(self):
        arch_name = self.obj_prefix + "-archive-without-copy.tar"
        self._archive_exec_assert(arch_name, self.bucket, self.bucket)

    def test_archive_objects_with_copy(self):
        arch_name = self.obj_prefix + "-archive-with-copy.tar"
        dest_bck = self._create_bucket(random_string())
        self._archive_exec_assert(arch_name, self.bucket, dest_bck, to_bck=dest_bck)

    def _archive_exec_assert(self, arch_name, src_bck, res_bck, **kwargs):
        # Add to the object list to clean up on test finish
        if res_bck.provider != PROVIDER_AIS:
            self._register_for_post_test_cleanup(names=[arch_name], is_bucket=False)
        archived_names = self.obj_names[1:5]
        expected_contents = {}
        for name in archived_names:
            expected_contents[name] = src_bck.object(obj_name=name).get().read_all()

        arch_job = src_bck.objects(obj_names=archived_names).archive(
            archive_name=arch_name, **kwargs
        )
        self.client.job(job_id=arch_job).wait_for_idle(timeout=TEST_TIMEOUT)

        # Read the tar archive and assert the object names and contents match
        res_bytes = res_bck.object(arch_name).get().read_all()
        with tarfile.open(fileobj=io.BytesIO(res_bytes), mode="r") as tar:
            member_names = []
            for member in tar.getmembers():
                inner_file = tar.extractfile(member)
                self.assertEqual(expected_contents[member.name], inner_file.read())
                inner_file.close()
                member_names.append(member.name)
            self.assertEqual(set(archived_names), set(member_names))

    @pytest.mark.etl
    def test_transform_objects(self):
        # Define an ETL with code that hashes the contents of each object
        etl_name = "etl-" + random_string(5)

        def transform(input_bytes):
            md5 = hashlib.md5()
            md5.update(input_bytes)
            return md5.hexdigest().encode()

        md5_etl = self.client.etl(etl_name)
        md5_etl.init_code(transform=transform)

        to_bck_name = "destination-bucket"
        to_bck = self._create_bucket(to_bck_name)
        new_prefix = PREFIX_NAME
        self.assertEqual(0, len(to_bck.list_all_objects(prefix=self.obj_prefix)))
        self.assertEqual(
            OBJECT_COUNT, len(self.bucket.list_all_objects(prefix=self.obj_prefix))
        )

        transform_job = self.bucket.objects(obj_names=self.obj_names).transform(
            to_bck, etl_name=md5_etl.name, prepend=new_prefix
        )
        self.client.job(job_id=transform_job).wait_for_idle(timeout=TEST_TIMEOUT)

        # Get the md5 transform of each source object and verify the destination bucket contains those results
        from_obj_hashes = [
            transform(self.bucket.object(name).get().read_all())
            for name in self.obj_names
        ]
        to_obj_values = [
            to_bck.object(new_prefix + name).get().read_all() for name in self.obj_names
        ]
        self.assertEqual(to_obj_values, from_obj_hashes)

    def _evict_all_objects(self, num_obj=OBJECT_COUNT):
        job_id = self.bucket.objects(obj_names=self.obj_names).evict()
        self.client.job(job_id).wait(timeout=TEST_TIMEOUT)
        self._check_all_objects_cached(num_obj, expected_cached=False)
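
# ---------------------------------------------------------------------------
# Usage sketch (not part of the test suite): the multi-object batch calls
# exercised above follow the same pattern outside of tests. A minimal sketch,
# assuming a reachable cluster; the endpoint URL, bucket name, and object
# names below are placeholders, not values used by this suite:
#
#     from aistore.sdk import Client
#
#     client = Client("http://localhost:8080")   # AIS cluster endpoint
#     bck = client.bucket("my-bucket")            # existing bucket
#     # group two objects and delete them in a single batch job
#     job_id = bck.objects(obj_names=["a.txt", "b.txt"]).delete()
#     client.job(job_id).wait(timeout=60)         # block until the job finishes
# ---------------------------------------------------------------------------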