github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsio_integration_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Integration tests for gcsio module.
    19  
    20  Runs tests against Google Cloud Storage service.
    21  Instantiates a TestPipeline to get options such as GCP project name, but
    22  doesn't actually start a Beam pipeline or test any specific runner.
    23  
    24  Options:
    25    --kms_key_name=projects/<project-name>/locations/<region>/keyRings/\
    26        <key-ring-name>/cryptoKeys/<key-name>/cryptoKeyVersions/<version>
    27      Pass a Cloud KMS key name to test GCS operations using customer managed
    28      encryption keys (CMEK).
    29  
    30  Cloud KMS permissions:
    31  The project's Cloud Storage service account requires Encrypter/Decrypter
    32  permissions for the key specified in --kms_key_name.
    33  
    34  To run these tests manually:
    35    ./gradlew :sdks:python:test-suites:dataflow:integrationTest \
    36      -Dtests=apache_beam.io.gcp.gcsio_integration_test:GcsIOIntegrationTest \
    37      -DkmsKeyName=KMS_KEY_NAME
    38  """
    39  
    40  # pytype: skip-file
    41  
    42  import logging
    43  import unittest
    44  import uuid
    45  
    46  import pytest
    47  
    48  from apache_beam.io.filesystems import FileSystems
    49  from apache_beam.testing.test_pipeline import TestPipeline
    50  
    51  try:
    52    from apache_beam.io.gcp import gcsio
    53  except ImportError:
    54    gcsio = None  # type: ignore
    55  
    56  
    57  @unittest.skipIf(gcsio is None, 'GCP dependencies are not installed')
    58  class GcsIOIntegrationTest(unittest.TestCase):
    59  
    60    INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
    61    # Larger than 1MB to test maxBytesRewrittenPerCall.
    62    # Also needs to be in a different region than the dest to take effect.
    63    INPUT_FILE_LARGE = 'gs://apache-beam-samples-us-east1/wikipedia_edits/wiki_data-000000000000.json'  # pylint: disable=line-too-long
    64  
    65    def setUp(self):
    66      self.test_pipeline = TestPipeline(is_integration_test=True)
    67      self.runner_name = type(self.test_pipeline.runner).__name__
    68      if self.runner_name != 'TestDataflowRunner':
    69        # This test doesn't run a pipeline, so it doesn't make sense to try it on
    70        # different runners. Running with TestDataflowRunner makes sense since
    71        # it uses GoogleCloudOptions such as 'project'.
    72        raise unittest.SkipTest('This test only runs with TestDataflowRunner.')
    73      self.project = self.test_pipeline.get_option('project')
    74      self.gcs_tempdir = (
    75          self.test_pipeline.get_option('temp_location') + '/gcs_it-' +
    76          str(uuid.uuid4()))
    77      self.kms_key_name = self.test_pipeline.get_option('kms_key_name')
    78      self.gcsio = gcsio.GcsIO()
    79  
    80    def tearDown(self):
    81      FileSystems.delete([self.gcs_tempdir + '/'])
    82  
    83    def _verify_copy(self, src, dst, dst_kms_key_name=None):
    84      self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
    85      self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
    86      src_checksum = self.gcsio.checksum(src)
    87      dst_checksum = self.gcsio.checksum(dst)
    88      self.assertEqual(src_checksum, dst_checksum)
    89      actual_dst_kms_key = self.gcsio.kms_key(dst)
    90      if actual_dst_kms_key is None:
    91        self.assertEqual(actual_dst_kms_key, dst_kms_key_name)
    92      else:
    93        self.assertTrue(
    94            actual_dst_kms_key.startswith(dst_kms_key_name),
    95            "got: %s, wanted startswith: %s" %
    96            (actual_dst_kms_key, dst_kms_key_name))
    97  
    98    def _test_copy(
    99        self,
   100        name,
   101        kms_key_name=None,
   102        max_bytes_rewritten_per_call=None,
   103        src=None):
   104      src = src or self.INPUT_FILE
   105      dst = self.gcs_tempdir + '/%s' % name
   106      extra_kwargs = {}
   107      if max_bytes_rewritten_per_call is not None:
   108        extra_kwargs['max_bytes_rewritten_per_call'] = (
   109            max_bytes_rewritten_per_call)
   110  
   111      self.gcsio.copy(src, dst, kms_key_name, **extra_kwargs)
   112      self._verify_copy(src, dst, kms_key_name)
   113  
   114    @pytest.mark.it_postcommit
   115    def test_copy(self):
   116      self._test_copy("test_copy")
   117  
   118    @pytest.mark.it_postcommit
   119    def test_copy_kms(self):
   120      if self.kms_key_name is None:
   121        raise unittest.SkipTest('--kms_key_name not specified')
   122      self._test_copy("test_copy_kms", self.kms_key_name)
   123  
   124    @pytest.mark.it_postcommit
   125    def test_copy_rewrite_token(self):
   126      # Tests a multi-part copy (rewrite) operation. This is triggered by a
   127      # combination of 3 conditions:
   128      #  - a large enough src
   129      #  - setting max_bytes_rewritten_per_call
   130      #  - setting kms_key_name
   131      if self.kms_key_name is None:
   132        raise unittest.SkipTest('--kms_key_name not specified')
   133  
   134      rewrite_responses = []
   135      self.gcsio._set_rewrite_response_callback(
   136          lambda response: rewrite_responses.append(response))
   137      self._test_copy(
   138          "test_copy_rewrite_token",
   139          kms_key_name=self.kms_key_name,
   140          max_bytes_rewritten_per_call=50 * 1024 * 1024,
   141          src=self.INPUT_FILE_LARGE)
   142      # Verify that there was a multi-part rewrite.
   143      self.assertTrue(any(not r.done for r in rewrite_responses))
   144  
   145    def _test_copy_batch(
   146        self,
   147        name,
   148        kms_key_name=None,
   149        max_bytes_rewritten_per_call=None,
   150        src=None):
   151      num_copies = 10
   152      srcs = [src or self.INPUT_FILE] * num_copies
   153      dsts = [self.gcs_tempdir + '/%s_%d' % (name, i) for i in range(num_copies)]
   154      src_dst_pairs = list(zip(srcs, dsts))
   155      extra_kwargs = {}
   156      if max_bytes_rewritten_per_call is not None:
   157        extra_kwargs['max_bytes_rewritten_per_call'] = (
   158            max_bytes_rewritten_per_call)
   159  
   160      result_statuses = self.gcsio.copy_batch(
   161          src_dst_pairs, kms_key_name, **extra_kwargs)
   162      for status in result_statuses:
   163        self.assertIsNone(status[2], status)
   164      for _src, _dst in src_dst_pairs:
   165        self._verify_copy(_src, _dst, kms_key_name)
   166  
   167    @pytest.mark.it_postcommit
   168    def test_copy_batch(self):
   169      self._test_copy_batch("test_copy_batch")
   170  
   171    @pytest.mark.it_postcommit
   172    def test_copy_batch_kms(self):
   173      if self.kms_key_name is None:
   174        raise unittest.SkipTest('--kms_key_name not specified')
   175      self._test_copy_batch("test_copy_batch_kms", self.kms_key_name)
   176  
   177    @pytest.mark.it_postcommit
   178    def test_copy_batch_rewrite_token(self):
   179      # Tests a multi-part copy (rewrite) operation. This is triggered by a
   180      # combination of 3 conditions:
   181      #  - a large enough src
   182      #  - setting max_bytes_rewritten_per_call
   183      #  - setting kms_key_name
   184      if self.kms_key_name is None:
   185        raise unittest.SkipTest('--kms_key_name not specified')
   186  
   187      rewrite_responses = []
   188      self.gcsio._set_rewrite_response_callback(
   189          lambda response: rewrite_responses.append(response))
   190      self._test_copy_batch(
   191          "test_copy_batch_rewrite_token",
   192          kms_key_name=self.kms_key_name,
   193          max_bytes_rewritten_per_call=50 * 1024 * 1024,
   194          src=self.INPUT_FILE_LARGE)
   195      # Verify that there was a multi-part rewrite.
   196      self.assertTrue(any(not r.done for r in rewrite_responses))
   197  
   198  
   199  if __name__ == '__main__':
   200    logging.getLogger().setLevel(logging.INFO)
   201    unittest.main()