import io
import json
import shutil
import tarfile
import unittest
from pathlib import Path

from aistore import Client
from tests.integration import CLUSTER_ENDPOINT
from tests.const import TEST_TIMEOUT
from tests.utils import test_cases, random_string


class TestDsortOps(unittest.TestCase):
    """Integration tests for AIStore dSort (distributed sort) jobs.

    Each test generates tar shards locally, uploads them to a fresh input
    bucket, starts a dSort job from a JSON spec, and inspects the output
    bucket. All buckets and local scratch files are cleaned up in tearDown.
    """

    def setUp(self) -> None:
        self.client = Client(CLUSTER_ENDPOINT)
        # Local scratch directory for generated shards and the spec file.
        # Recreate it from a clean state if a previous run left it behind.
        self.temp_dir = Path("tmp")
        try:
            self.temp_dir.mkdir()
        except FileExistsError:
            shutil.rmtree(self.temp_dir)
            self.temp_dir.mkdir()
        # Names of every bucket created during the test, for cleanup.
        self.buckets = []

    def tearDown(self) -> None:
        # Remove local scratch files and all buckets created by the test.
        shutil.rmtree(self.temp_dir)
        for bucket in self.buckets:
            self.client.bucket(bucket).delete(missing_ok=True)

    def _upload_dir(self, dir_name, bck_name):
        """Create bucket `bck_name` (tracked for cleanup) and upload every
        file under `dir_name` into it."""
        bck = self.client.bucket(bck_name).create(exist_ok=True)
        self.buckets.append(bck_name)
        bck.put_files(dir_name)

    @staticmethod
    def _generate_tar(filename, prefix, tar_format, num_files):
        """Write a tar archive at `filename` containing `num_files` small
        text files filled with random strings.

        Member files are created (and removed) in the current working
        directory so their archive names carry no directory prefix.
        """
        with tarfile.open(filename, "w|", format=tar_format) as tar:
            for i in range(num_files):
                # Use a distinct local name for the member file so we do not
                # shadow the `filename` parameter (the archive's own path).
                member_name = f"shard-{prefix}-file-{i}.txt"
                with open(member_name, "w", encoding="utf-8") as text:
                    text.write(random_string())
                # Add the file to the tarfile
                tar.add(member_name)
                # Remove the scratch file after adding it to the tarfile
                Path(member_name).unlink()

    def _generate_shards(self, tar_type, tar_enum, num_shards, num_files):
        """Generate `num_shards` tar shards locally and upload them to a
        bucket named `tar_type`. Returns the list of shard object names."""
        shard_names = []
        out_dir = Path(self.temp_dir).joinpath(tar_type)
        out_dir.mkdir(exist_ok=True)
        for shard_index in range(num_shards):
            name = f"{tar_type}-{shard_index}.tar"
            filename = out_dir.joinpath(name)
            self._generate_tar(filename, shard_index, tar_enum, num_files)
            shard_names.append(name)
        self._upload_dir(out_dir, tar_type)
        return shard_names

    def _get_object_content_map(self, bucket_name, object_names):
        """Download each tar object and return a dict mapping every tar
        member name to its byte contents (across all listed objects)."""
        expected_contents = {}
        for obj in object_names:
            output_bytes = self.client.bucket(bucket_name).object(obj).get().read_all()
            output = io.BytesIO(output_bytes)
            with tarfile.open(fileobj=output) as result_tar:
                for tar in result_tar:
                    expected_contents[tar.name] = result_tar.extractfile(
                        tar.name
                    ).read()
        return expected_contents

    def _start_with_spec(self, input_bck_name, out_bck_name, input_object_prefix):
        """Write a dSort spec file to the scratch dir, start the job, and
        return the Dsort handle (job is running when this returns)."""
        spec = {
            "input_extension": ".tar",
            "input_bck": {"name": input_bck_name},
            "output_bck": {"name": out_bck_name},
            "input_format": {"template": input_object_prefix + "-{0..1}"},
            "output_format": "out-shard-{0..9}",
            "output_shard_size": "20MB",
            "description": "Dsort Integration Test",
        }
        spec_file = self.temp_dir.joinpath("spec.json")
        with open(spec_file, "w", encoding="utf-8") as outfile:
            outfile.write(json.dumps(spec, indent=4))
        dsort = self.client.dsort()
        dsort.start(spec_file)
        return dsort

    @test_cases(("gnu", tarfile.GNU_FORMAT, 2, 3), ("pax", tarfile.PAX_FORMAT, 2, 3))
    def test_dsort(self, test_case):
        """dSort over GNU- and PAX-format shards preserves every member's
        contents in the output shard."""
        tar_type, tar_format, num_shards, num_files = test_case
        # create bucket for output
        out_bck_name = tar_type + "-out"
        self.client.bucket(out_bck_name).create(exist_ok=True)
        self.buckets.append(out_bck_name)
        # create tars as objects in buckets
        shards = self._generate_shards(tar_type, tar_format, num_shards, num_files)
        # Read created objects to get expected output after dsort
        expected_contents = self._get_object_content_map(
            bucket_name=tar_type, object_names=shards
        )
        dsort = self._start_with_spec(
            input_bck_name=tar_type,
            out_bck_name=out_bck_name,
            input_object_prefix=tar_type,
        )
        dsort.wait(timeout=TEST_TIMEOUT)
        # With this input size everything lands in the first output shard.
        output_bytes = (
            self.client.bucket(out_bck_name).object("out-shard-0.tar").get().read_all()
        )
        output = io.BytesIO(output_bytes)
        result_contents = {}
        with tarfile.open(fileobj=output) as result_tar:
            for tar in result_tar:
                result_contents[tar.name] = result_tar.extractfile(tar.name).read()

        self.assertEqual(expected_contents, result_contents)

    def test_abort(self):
        """Aborting a running dSort job marks it aborted and records an error."""
        input_bck_name = "abort"
        out_bck_name = "out"
        self.client.bucket(input_bck_name).create(exist_ok=True)
        self.buckets.append(input_bck_name)
        self.client.bucket(out_bck_name).create(exist_ok=True)
        self.buckets.append(out_bck_name)
        # Create enough files to make the dSort job slow enough to abort
        self._generate_shards(input_bck_name, tarfile.GNU_FORMAT, 10, 1000)
        dsort = self._start_with_spec(
            input_bck_name=input_bck_name,
            out_bck_name=out_bck_name,
            input_object_prefix=input_bck_name,
        )
        dsort.abort()
        dsort.wait(timeout=TEST_TIMEOUT)
        for job_info in dsort.get_job_info().values():
            self.assertTrue(job_info.metrics.aborted)
            self.assertEqual(1, len(job_info.metrics.errors))