github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/tests/integration/sdk/test_dsort_ops.py (about)

     1  import io
     2  import json
     3  import shutil
     4  import tarfile
     5  import unittest
     6  from pathlib import Path
     7  
     8  from aistore import Client
     9  from tests.integration import CLUSTER_ENDPOINT
    10  from tests.const import TEST_TIMEOUT
    11  from tests.utils import test_cases, random_string
    12  
    13  
    14  class TestDsortOps(unittest.TestCase):
    15      def setUp(self) -> None:
    16          self.client = Client(CLUSTER_ENDPOINT)
    17          self.temp_dir = Path("tmp")
    18          try:
    19              self.temp_dir.mkdir()
    20          except FileExistsError:
    21              shutil.rmtree(self.temp_dir)
    22              self.temp_dir.mkdir()
    23          self.buckets = []
    24  
    25      def tearDown(self) -> None:
    26          shutil.rmtree(self.temp_dir)
    27          for bucket in self.buckets:
    28              self.client.bucket(bucket).delete(missing_ok=True)
    29  
    30      def _upload_dir(self, dir_name, bck_name):
    31          bck = self.client.bucket(bck_name).create(exist_ok=True)
    32          self.buckets.append(bck_name)
    33          bck.put_files(dir_name)
    34  
    35      @staticmethod
    36      def _generate_tar(filename, prefix, tar_format, num_files):
    37          with tarfile.open(filename, "w|", format=tar_format) as tar:
    38              for i in range(num_files):
    39                  # Create a file name and write random text to it
    40                  filename = f"shard-{prefix}-file-{i}.txt"
    41                  with open(filename, "w", encoding="utf-8") as text:
    42                      text.write(random_string())
    43                  # Add the file to the tarfile
    44                  tar.add(filename)
    45                  # Remove the file after adding it to the tarfile
    46                  Path(filename).unlink()
    47  
    48      def _generate_shards(self, tar_type, tar_enum, num_shards, num_files):
    49          shard_names = []
    50          out_dir = Path(self.temp_dir).joinpath(tar_type)
    51          out_dir.mkdir(exist_ok=True)
    52          for shard_index in range(num_shards):
    53              name = f"{tar_type}-{shard_index}.tar"
    54              filename = out_dir.joinpath(name)
    55              self._generate_tar(filename, shard_index, tar_enum, num_files)
    56              shard_names.append(name)
    57          self._upload_dir(out_dir, tar_type)
    58          return shard_names
    59  
    60      def _get_object_content_map(self, bucket_name, object_names):
    61          expected_contents = {}
    62          for obj in object_names:
    63              output_bytes = self.client.bucket(bucket_name).object(obj).get().read_all()
    64              output = io.BytesIO(output_bytes)
    65              with tarfile.open(fileobj=output) as result_tar:
    66                  for tar in result_tar:
    67                      expected_contents[tar.name] = result_tar.extractfile(
    68                          tar.name
    69                      ).read()
    70          return expected_contents
    71  
    72      def _start_with_spec(self, input_bck_name, out_bck_name, input_object_prefix):
    73          spec = {
    74              "input_extension": ".tar",
    75              "input_bck": {"name": input_bck_name},
    76              "output_bck": {"name": out_bck_name},
    77              "input_format": {"template": input_object_prefix + "-{0..1}"},
    78              "output_format": "out-shard-{0..9}",
    79              "output_shard_size": "20MB",
    80              "description": "Dsort Integration Test",
    81          }
    82          spec_file = self.temp_dir.joinpath("spec.json")
    83          with open(spec_file, "w", encoding="utf-8") as outfile:
    84              outfile.write(json.dumps(spec, indent=4))
    85          dsort = self.client.dsort()
    86          dsort.start(spec_file)
    87          return dsort
    88  
    89      @test_cases(("gnu", tarfile.GNU_FORMAT, 2, 3), ("pax", tarfile.PAX_FORMAT, 2, 3))
    90      def test_dsort(self, test_case):
    91          tar_type, tar_format, num_shards, num_files = test_case
    92          # create bucket for output
    93          out_bck_name = tar_type + "-out"
    94          self.client.bucket(out_bck_name).create(exist_ok=True)
    95          self.buckets.append(out_bck_name)
    96          # create tars as objects in buckets
    97          shards = self._generate_shards(tar_type, tar_format, num_shards, num_files)
    98          # Read created objects to get expected output after dsort
    99          expected_contents = self._get_object_content_map(
   100              bucket_name=tar_type, object_names=shards
   101          )
   102          dsort = self._start_with_spec(
   103              input_bck_name=tar_type,
   104              out_bck_name=out_bck_name,
   105              input_object_prefix=tar_type,
   106          )
   107          dsort.wait(timeout=TEST_TIMEOUT)
   108          output_bytes = (
   109              self.client.bucket(out_bck_name).object("out-shard-0.tar").get().read_all()
   110          )
   111          output = io.BytesIO(output_bytes)
   112          result_contents = {}
   113          with tarfile.open(fileobj=output) as result_tar:
   114              for tar in result_tar:
   115                  result_contents[tar.name] = result_tar.extractfile(tar.name).read()
   116  
   117          self.assertEqual(expected_contents, result_contents)
   118  
   119      def test_abort(self):
   120          input_bck_name = "abort"
   121          out_bck_name = "out"
   122          self.client.bucket(input_bck_name).create(exist_ok=True)
   123          self.buckets.append(input_bck_name)
   124          self.client.bucket(out_bck_name).create(exist_ok=True)
   125          self.buckets.append(out_bck_name)
   126          # Create enough files to make the dSort job slow enough to abort
   127          self._generate_shards(input_bck_name, tarfile.GNU_FORMAT, 10, 1000)
   128          dsort = self._start_with_spec(
   129              input_bck_name=input_bck_name,
   130              out_bck_name=out_bck_name,
   131              input_object_prefix=input_bck_name,
   132          )
   133          dsort.abort()
   134          dsort.wait(timeout=TEST_TIMEOUT)
   135          for job_info in dsort.get_job_info().values():
   136              self.assertTrue(job_info.metrics.aborted)
   137              self.assertEqual(1, len(job_info.metrics.errors))