#
# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
#
import unittest
from pathlib import Path

import boto3

import requests

from aistore.sdk import ListObjectFlag
from aistore.sdk.const import UTF_ENCODING, LOREM, DUIS
from aistore.sdk.dataset.dataset_config import DatasetConfig
from aistore.sdk.dataset.data_attribute import DataAttribute
from aistore.sdk.dataset.label_attribute import LabelAttribute
from aistore.sdk.errors import InvalidBckProvider, AISError, ErrBckNotFound

from tests.integration.sdk.remote_enabled_test import RemoteEnabledTest
from tests import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
from tests.integration.boto3 import AWS_REGION

from tests.utils import random_string, cleanup_local, test_cases
from tests.const import OBJ_NAME, OBJECT_COUNT, OBJ_CONTENT, PREFIX_NAME
from tests.integration import REMOTE_SET

INNER_DIR = "directory"
DATASET_DIR = "dataset"
TOP_LEVEL_FILES = {
    "top_level_file.txt": b"test data to verify",
    "other_top_level_file.txt": b"other file test data to verify",
}
LOWER_LEVEL_FILES = {"lower_level_file.txt": b"data in inner file"}


def _create_files(folder, file_dict):
    for filename, data in file_dict.items():
        lower_file = folder.joinpath(filename)
        with open(lower_file, "wb") as file:
            file.write(data)


# pylint: disable=unused-variable, too-many-public-methods
class TestBucketOps(RemoteEnabledTest):
    def setUp(self) -> None:
        super().setUp()
        self.local_test_files = (
            Path().absolute().joinpath("bucket-ops-test-" + random_string(8))
        )

    def tearDown(self) -> None:
        super().tearDown()
        cleanup_local(str(self.local_test_files))

    def _create_put_files_structure(self, top_level_files, lower_level_files):
        self.local_test_files.mkdir(exist_ok=True)
        _create_files(self.local_test_files, top_level_files)
        inner_dir = self.local_test_files.joinpath(INNER_DIR)
        inner_dir.mkdir()
        _create_files(inner_dir, lower_level_files)

    def test_bucket(self):
        new_bck_name = random_string(10)
        self._create_bucket(new_bck_name)
        res = self.client.cluster().list_buckets()
        bucket_names = {bck.name for bck in res}
        self.assertIn(new_bck_name, bucket_names)

    @test_cases(
        "*", ".", "", " ", "bucket/name", "bucket and name", "#name", "$name", "~name"
    )
    def test_create_bucket_invalid_name(self, testcase):
        with self.assertRaises(AISError):
            self._create_bucket(testcase)

    def test_bucket_invalid_name(self):
        with self.assertRaises(ErrBckNotFound):
            self.client.bucket("INVALID_BCK_NAME").list_objects()

    def test_bucket_invalid_aws_name(self):
        with self.assertRaises(AISError):
            self.client.bucket("INVALID_BCK_NAME", "aws").list_objects()

    def test_head(self):
        try:
            self.bucket.head()
        except requests.exceptions.HTTPError as err:
            self.assertEqual(err.response.status_code, 404)

    def test_rename(self):
        from_bck_name = self.bck_name + "from"
        to_bck_name = self.bck_name + "to"
        from_bck = self._create_bucket(from_bck_name)
        self.client.cluster().list_buckets()

        self.assertEqual(from_bck_name, from_bck.name)
        job_id = from_bck.rename(to_bck_name=to_bck_name)
        self.assertNotEqual(job_id, "")

        # wait for rename to finish
        self.client.job(job_id).wait()

        # new bucket should be created and accessible
        to_bck = self.client.bucket(to_bck_name)
        to_bck.head()
        self.assertEqual(to_bck_name, to_bck.name)

        # old bucket should be inaccessible
        try:
            from_bck.head()
        except requests.exceptions.HTTPError as err:
            self.assertEqual(err.response.status_code, 404)
        self._register_for_post_test_cleanup(names=[to_bck_name], is_bucket=True)
    def test_copy(self):
        from_bck_name = self.bck_name + "from"
        to_bck_name = self.bck_name + "to"
        from_bck = self._create_bucket(from_bck_name)
        to_bck = self._create_bucket(to_bck_name)
        prefix = PREFIX_NAME
        new_prefix = "new-"
        content = b"test"
        expected_name = prefix + "-obj"
        from_bck.object(expected_name).put_content(content)
        from_bck.object("notprefix-obj").put_content(content)

        job_id = from_bck.copy(to_bck, prefix_filter=prefix, prepend=new_prefix)

        self.assertNotEqual(job_id, "")
        self.client.job(job_id).wait()
        entries = to_bck.list_all_objects()
        self.assertEqual(1, len(entries))
        self.assertEqual(new_prefix + expected_name, entries[0].name)

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_get_latest_flag(self):
        obj_name = random_string()
        self._register_for_post_test_cleanup(names=[obj_name], is_bucket=False)

        s3_client = boto3.client(
            "s3",
            region_name=AWS_REGION,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            # aws_session_token=AWS_SESSION_TOKEN,
        )

        # out-of-band PUT: first version
        s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=LOREM)

        # cold GET, and check
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(LOREM, content.decode("utf-8"))

        # out-of-band PUT: 2nd version (overwrite)
        s3_client.put_object(Bucket=self.bucket.name, Key=obj_name, Body=DUIS)

        # warm GET and check (expecting the first version's content)
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(LOREM, content.decode("utf-8"))

        # warm GET with `--latest` flag, content should be updated
        content = self.bucket.object(obj_name).get(latest=True).read_all()
        self.assertEqual(DUIS, content.decode("utf-8"))

        # out-of-band DELETE
        s3_client.delete_object(Bucket=self.bucket.name, Key=obj_name)

        # warm GET must be fine
        content = self.bucket.object(obj_name).get().read_all()
        self.assertEqual(DUIS, content.decode("utf-8"))

        # cold GET must result in Error
        with self.assertRaises(AISError):
            self.bucket.object(obj_name).get(latest=True)

    @unittest.skipIf(
        not REMOTE_SET,
        "Remote bucket is not set",
    )
    def test_evict(self):
        self._create_objects()
        objects = self.bucket.list_objects(
            props="name,cached", prefix=self.obj_prefix
        ).entries
        self._validate_objects_cached(objects, True)

        self.bucket.evict(keep_md=True)

        objects = self.bucket.list_objects(
            props="name,cached", prefix=self.obj_prefix
        ).entries
        self.assertEqual(OBJECT_COUNT, len(objects))
        self._validate_objects_cached(objects, False)

    def test_evict_local(self):
        # If the bucket is local, eviction should fail
        if not REMOTE_SET:
            with self.assertRaises(InvalidBckProvider):
                self.bucket.evict()
            return
        # Create a local bucket to test with if self.bucket is a cloud bucket
        local_bucket = self._create_bucket(self.bck_name + "-local")
        with self.assertRaises(InvalidBckProvider):
            local_bucket.evict()
    def test_put_files_invalid(self):
        with self.assertRaises(ValueError):
            self.bucket.put_files("non-existent-dir")
        self.local_test_files.mkdir()
        filename = self.local_test_files.joinpath("file_not_dir")
        with open(filename, "w", encoding=UTF_ENCODING):
            pass
        with self.assertRaises(ValueError):
            self.bucket.put_files(filename)

    def _verify_obj_res(self, expected_res_dict, expect_err=False):
        if expect_err:
            for obj_name in expected_res_dict:
                with self.assertRaises(AISError):
                    self.bucket.object(self.obj_prefix + obj_name).get()
        else:
            for obj_name, expected_data in expected_res_dict.items():
                res = self.bucket.object(self.obj_prefix + obj_name).get()
                self.assertEqual(expected_data, res.read_all())

    def test_put_files_default_args(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(self.local_test_files, prepend=self.obj_prefix)
        self._verify_obj_res(TOP_LEVEL_FILES)
        self._verify_obj_res(LOWER_LEVEL_FILES, expect_err=True)

    def test_put_files_recursive(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(
            self.local_test_files, recursive=True, prepend=self.obj_prefix
        )

        self._verify_obj_res(TOP_LEVEL_FILES)
        # Lower level file object names will include their relative path by default
        expected_lower_res = {}
        for obj_name, expected_data in LOWER_LEVEL_FILES.items():
            obj_name = str(Path(INNER_DIR).joinpath(obj_name))
            expected_lower_res[obj_name] = expected_data
        self._verify_obj_res(expected_lower_res)

    def test_put_files_recursive_basename(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(
            self.local_test_files,
            recursive=True,
            basename=True,
            prepend=self.obj_prefix,
        )

        # Expect all objects to be prepended with the object prefix and to carry no
        # relative path in their names due to the basename option
        joined_file_data = {**TOP_LEVEL_FILES, **LOWER_LEVEL_FILES}
        expected_res = {}
        for obj_name, expected_data in joined_file_data.items():
            expected_res[obj_name] = expected_data
        self._verify_obj_res(expected_res)

    def test_put_files_filtered(self):
        self.local_test_files.mkdir()
        included_filename = "prefix-file.txt"
        excluded_by_pattern = "extra_top_file.py"
        excluded_by_prefix = "non-prefix-file.txt"
        for file in [included_filename, excluded_by_pattern, excluded_by_prefix]:
            with open(self.local_test_files.joinpath(file), "wb"):
                pass
        self.bucket.put_files(
            self.local_test_files,
            prepend=self.obj_prefix,
            prefix_filter=PREFIX_NAME,
            pattern="*.txt",
        )
        self.bucket.object(self.obj_prefix + included_filename).get()
        with self.assertRaises(AISError):
            self.bucket.object(excluded_by_pattern).get()
        with self.assertRaises(AISError):
            self.bucket.object(excluded_by_prefix).get()

    def test_put_files_dry_run(self):
        self._create_put_files_structure(TOP_LEVEL_FILES, LOWER_LEVEL_FILES)
        self.bucket.put_files(
            self.local_test_files, dry_run=True, prepend=self.obj_prefix
        )
        # Verify the put files call does not actually create objects
        self._verify_obj_res(TOP_LEVEL_FILES, expect_err=True)

    @test_cases((None, OBJECT_COUNT), (7, 7), (OBJECT_COUNT * 2, OBJECT_COUNT))
    def test_list_objects(self, test_case):
        page_size, response_size = test_case
        # Only create the bucket entries on the first subtest run
        if len(self.bucket.list_all_objects(prefix=self.obj_prefix)) == 0:
            self._create_objects()
        if page_size:
            resp = self.bucket.list_objects(
                page_size=page_size, prefix=self.obj_prefix
            )
        else:
            resp = self.bucket.list_objects(prefix=self.obj_prefix)
        self.assertEqual(response_size, len(resp.entries))
    def test_list_all_objects(self):
        short_page_len = 17
        self._create_objects()
        objects = self.bucket.list_all_objects(prefix=self.obj_prefix)
        self.assertEqual(OBJECT_COUNT, len(objects))
        objects = self.bucket.list_all_objects(
            page_size=short_page_len, prefix=self.obj_prefix
        )
        self.assertEqual(OBJECT_COUNT, len(objects))

    def test_list_object_iter(self):
        obj_names = set(self._create_objects())

        # Empty iterator if there are no objects matching the prefix.
        obj_iter = self.bucket.list_objects_iter(prefix="invalid-obj-")
        self.assertEqual(0, len(list(obj_iter)))

        # Read all created objects by prefix.
        obj_iter = self.bucket.list_objects_iter(page_size=10, prefix=self.obj_prefix)
        for obj in obj_iter:
            obj_names.remove(obj.name)
        self.assertEqual(0, len(obj_names))

    def test_list_object_flags(self):
        self._create_objects()
        objects = self.bucket.list_all_objects(
            flags=[ListObjectFlag.NAME_ONLY, ListObjectFlag.CACHED],
            prefix=self.obj_prefix,
        )
        self.assertEqual(OBJECT_COUNT, len(objects))
        for obj in objects:
            self.assertEqual(0, obj.size)

        objects = self.bucket.list_all_objects(
            flags=[ListObjectFlag.NAME_SIZE], prefix=self.obj_prefix
        )
        self.assertEqual(OBJECT_COUNT, len(objects))
        for obj in objects:
            self.assertTrue(obj.size > 0)

    def test_summary(self):
        summ_test_bck = self._create_bucket("summary-test")

        # Initially, the bucket should be empty
        bucket_summary = summ_test_bck.summary()

        self.assertEqual(bucket_summary["ObjCount"]["obj_count_present"], "0")
        self.assertEqual(bucket_summary["TotalSize"]["size_all_present_objs"], "0")
        self.assertEqual(bucket_summary["TotalSize"]["size_all_remote_objs"], "0")
        self.assertEqual(bucket_summary["used_pct"], 0)

        summ_test_bck.object(OBJ_NAME).put_content(OBJ_CONTENT)

        bucket_summary = summ_test_bck.summary()

        # Now, the bucket should have 1 object
        self.assertEqual(bucket_summary["ObjCount"]["obj_count_present"], "1")
        self.assertNotEqual(bucket_summary["TotalSize"]["size_all_present_objs"], "0")

        summ_test_bck.delete()

        # Accessing the summary of a deleted bucket should raise an error
        with self.assertRaises(ErrBckNotFound):
            summ_test_bck.summary()

    def test_info(self):
        info_test_bck = self._create_bucket("info-test")

        # Initially, the bucket should be empty
        _, bck_summ = info_test_bck.info(flt_presence=0)

        # For an empty bucket, the object count and total size should be zero
        self.assertEqual(bck_summ["ObjCount"]["obj_count_present"], "0")
        self.assertEqual(bck_summ["TotalSize"]["size_all_present_objs"], "0")
        self.assertEqual(bck_summ["TotalSize"]["size_all_remote_objs"], "0")
        self.assertEqual(bck_summ["provider"], "ais")
        self.assertEqual(bck_summ["name"], "info-test")

        # Upload an object to the bucket
        info_test_bck.object(OBJ_NAME).put_content(OBJ_CONTENT)

        _, bck_summ = info_test_bck.info()

        # Now the bucket should have one object and non-zero size
        self.assertEqual(bck_summ["ObjCount"]["obj_count_present"], "1")
        self.assertNotEqual(bck_summ["TotalSize"]["size_all_present_objs"], "0")
        self.assertEqual(bck_summ["TotalSize"]["size_all_remote_objs"], "0")
        self.assertEqual(bck_summ["provider"], "ais")
        self.assertEqual(bck_summ["name"], "info-test")

        info_test_bck.delete()

        # Requesting the summary of the deleted bucket should raise an error
        with self.assertRaises(ErrBckNotFound):
            info_test_bck.summary()
    def test_write_dataset(self):
        self.local_test_files.mkdir(exist_ok=True)
        dataset_directory = self.local_test_files.joinpath(DATASET_DIR)
        dataset_directory.mkdir(exist_ok=True)
        img_files = {
            "file1.jpg": b"file1",
            "file2.jpg": b"file2",
            "file3.jpg": b"file3",
        }
        _create_files(dataset_directory, img_files)

        dataset_config = DatasetConfig(
            primary_attribute=DataAttribute(
                path=dataset_directory, name="image", file_type="jpg"
            ),
            secondary_attributes=[
                LabelAttribute(
                    name="label", label_identifier=lambda filename: f"{filename}_label"
                )
            ],
        )
        shards = []

        def post_process(shard_path):
            self._register_for_post_test_cleanup(names=[shard_path], is_bucket=False)
            shards.append(shard_path)

        self.bucket.write_dataset(
            dataset_config, pattern="dataset", maxcount=10, post=post_process
        )
        self.assertEqual(len(shards), 1)
        for shard in shards:
            self.assertIsNotNone(self.bucket.object(shard).head())

    def test_write_dataset_missing_attributes(self):
        self.local_test_files.mkdir(exist_ok=True)
        dataset_directory = self.local_test_files.joinpath(DATASET_DIR)
        dataset_directory.mkdir(exist_ok=True)
        img_files = {
            "file1.jpg": b"file1",
            "file2.jpg": b"file2",
            "file3.jpg": b"file3",
        }
        _create_files(dataset_directory, img_files)

        dataset_config = DatasetConfig(
            primary_attribute=DataAttribute(
                path=dataset_directory, name="image", file_type="jpg"
            ),
            secondary_attributes=[
                LabelAttribute(name="cls", label_identifier=lambda filename: None)
            ],
        )
        self.bucket.write_dataset(
            dataset_config, skip_missing=False, pattern="dataset", maxcount=10
        )
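

# A minimal direct-run entry point: a sketch, on the assumption that this module may
# occasionally be invoked as a standalone script. These integration tests are normally
# run through the repo's pytest targets, which do not need this guard.
if __name__ == "__main__":
    unittest.main()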