#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for gcsio module.

Runs tests against Google Cloud Storage service.
Instantiates a TestPipeline to get options such as GCP project name, but
doesn't actually start a Beam pipeline or test any specific runner.

Options:
  --kms_key_name=projects/<project-name>/locations/<region>/keyRings/\
      <key-ring-name>/cryptoKeys/<key-name>/cryptoKeyVersions/<version>
    Pass a Cloud KMS key name to test GCS operations using customer managed
    encryption keys (CMEK).

Cloud KMS permissions:
  The project's Cloud Storage service account requires Encrypter/Decrypter
  permissions for the key specified in --kms_key_name.

To run these tests manually:
  ./gradlew :sdks:python:test-suites:dataflow:integrationTest \
    -Dtests=apache_beam.io.gcp.gcsio_integration_test:GcsIOIntegrationTest \
    -DkmsKeyName=KMS_KEY_NAME
"""

# pytype: skip-file

import logging
import unittest
import uuid

import pytest

from apache_beam.io.filesystems import FileSystems
from apache_beam.testing.test_pipeline import TestPipeline

try:
  from apache_beam.io.gcp import gcsio
except ImportError:
  gcsio = None  # type: ignore


@unittest.skipIf(gcsio is None, 'GCP dependencies are not installed')
class GcsIOIntegrationTest(unittest.TestCase):
  """Copies real GCS objects with GcsIO and verifies the results.

  Each test writes into a unique sub-directory of the pipeline's
  ``temp_location``; ``tearDown`` deletes that sub-directory. Tests that need
  a customer managed encryption key are skipped unless ``--kms_key_name`` is
  supplied.
  """

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  # Larger than 1MB to test maxBytesRewrittenPerCall.
  # Also needs to be in a different region than the dest to take effect.
  INPUT_FILE_LARGE = 'gs://apache-beam-samples-us-east1/wikipedia_edits/wiki_data-000000000000.json'  # pylint: disable=line-too-long
  # Value passed as max_bytes_rewritten_per_call by the rewrite-token tests.
  _MAX_BYTES_REWRITTEN_PER_CALL = 50 * 1024 * 1024

  def setUp(self):
    """Reads pipeline options and picks a unique GCS directory for the test.

    Raises:
      unittest.SkipTest: if the configured runner is not TestDataflowRunner.
    """
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    if self.runner_name != 'TestDataflowRunner':
      # This test doesn't run a pipeline, so it doesn't make sense to try it on
      # different runners. Running with TestDataflowRunner makes sense since
      # it uses GoogleCloudOptions such as 'project'.
      raise unittest.SkipTest('This test only runs with TestDataflowRunner.')
    self.project = self.test_pipeline.get_option('project')
    # A random suffix keeps concurrent or repeated runs from colliding.
    self.gcs_tempdir = (
        self.test_pipeline.get_option('temp_location') + '/gcs_it-' +
        str(uuid.uuid4()))
    self.kms_key_name = self.test_pipeline.get_option('kms_key_name')
    self.gcsio = gcsio.GcsIO()

  def tearDown(self):
    """Deletes everything under this test's temporary GCS directory."""
    FileSystems.delete([self.gcs_tempdir + '/'])

  def _skip_unless_kms_key(self):
    """Skips the current test when no --kms_key_name option was given.

    Raises:
      unittest.SkipTest: if ``self.kms_key_name`` is None.
    """
    if self.kms_key_name is None:
      raise unittest.SkipTest('--kms_key_name not specified')

  def _capture_rewrite_responses(self):
    """Registers a rewrite-response callback and returns its target list.

    Returns:
      A list to which every response handed to the callback is appended.
    """
    rewrite_responses = []
    self.gcsio._set_rewrite_response_callback(rewrite_responses.append)
    return rewrite_responses

  @staticmethod
  def _optional_copy_kwargs(max_bytes_rewritten_per_call):
    """Builds the keyword arguments that are only passed when explicitly set.

    Args:
      max_bytes_rewritten_per_call: per-call rewrite limit, or None to leave
        the argument out so the callee's own default applies.

    Returns:
      A dict that is empty, or holds 'max_bytes_rewritten_per_call'.
    """
    if max_bytes_rewritten_per_call is None:
      return {}
    return {'max_bytes_rewritten_per_call': max_bytes_rewritten_per_call}

  def _verify_copy(self, src, dst, dst_kms_key_name=None):
    """Asserts that dst is a faithful copy of src with the expected KMS key.

    Args:
      src: GCS path of the source object.
      dst: GCS path of the destination object.
      dst_kms_key_name: KMS key name the destination is expected to use, or
        None when the destination is expected to have no KMS key.
    """
    self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
    self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
    src_checksum = self.gcsio.checksum(src)
    dst_checksum = self.gcsio.checksum(dst)
    self.assertEqual(src_checksum, dst_checksum)
    actual_dst_kms_key = self.gcsio.kms_key(dst)
    if actual_dst_kms_key is None or dst_kms_key_name is None:
      # With either side missing, both must be missing. Checking the expected
      # name here as well avoids calling str.startswith(None), which would
      # raise TypeError instead of reporting a readable assertion failure.
      self.assertEqual(actual_dst_kms_key, dst_kms_key_name)
    else:
      # Only a prefix match is required; the reported key may be longer than
      # the requested name.
      self.assertTrue(
          actual_dst_kms_key.startswith(dst_kms_key_name),
          "got: %s, wanted startswith: %s" %
          (actual_dst_kms_key, dst_kms_key_name))

  def _test_copy(
      self,
      name,
      kms_key_name=None,
      max_bytes_rewritten_per_call=None,
      src=None):
    """Copies one object into the temp directory and verifies the copy.

    Args:
      name: file name of the destination inside the temp directory.
      kms_key_name: optional KMS key name passed to the copy.
      max_bytes_rewritten_per_call: optional per-call rewrite limit; only
        passed to the copy when not None.
      src: source path; defaults to INPUT_FILE.
    """
    src = src or self.INPUT_FILE
    dst = self.gcs_tempdir + '/%s' % name
    extra_kwargs = self._optional_copy_kwargs(max_bytes_rewritten_per_call)

    self.gcsio.copy(src, dst, kms_key_name, **extra_kwargs)
    self._verify_copy(src, dst, kms_key_name)

  @pytest.mark.it_postcommit
  def test_copy(self):
    """Copies the default input without a KMS key."""
    self._test_copy("test_copy")

  @pytest.mark.it_postcommit
  def test_copy_kms(self):
    """Copies the default input using the supplied KMS key."""
    self._skip_unless_kms_key()
    self._test_copy("test_copy_kms", self.kms_key_name)

  @pytest.mark.it_postcommit
  def test_copy_rewrite_token(self):
    """Checks that a single copy is carried out as a multi-part rewrite."""
    # Tests a multi-part copy (rewrite) operation. This is triggered by a
    # combination of 3 conditions:
    #  - a large enough src
    #  - setting max_bytes_rewritten_per_call
    #  - setting kms_key_name
    self._skip_unless_kms_key()

    rewrite_responses = self._capture_rewrite_responses()
    self._test_copy(
        "test_copy_rewrite_token",
        kms_key_name=self.kms_key_name,
        max_bytes_rewritten_per_call=self._MAX_BYTES_REWRITTEN_PER_CALL,
        src=self.INPUT_FILE_LARGE)
    # Verify that there was a multi-part rewrite.
    self.assertTrue(any(not r.done for r in rewrite_responses))

  def _test_copy_batch(
      self,
      name,
      kms_key_name=None,
      max_bytes_rewritten_per_call=None,
      src=None):
    """Copies one source to ten destinations in a batch and verifies each.

    Args:
      name: prefix of the destination file names; '_<index>' is appended.
      kms_key_name: optional KMS key name passed to the batch copy.
      max_bytes_rewritten_per_call: optional per-call rewrite limit; only
        passed to the batch copy when not None.
      src: source path; defaults to INPUT_FILE.
    """
    num_copies = 10
    srcs = [src or self.INPUT_FILE] * num_copies
    dsts = [self.gcs_tempdir + '/%s_%d' % (name, i) for i in range(num_copies)]
    src_dst_pairs = list(zip(srcs, dsts))
    extra_kwargs = self._optional_copy_kwargs(max_bytes_rewritten_per_call)

    result_statuses = self.gcsio.copy_batch(
        src_dst_pairs, kms_key_name, **extra_kwargs)
    for status in result_statuses:
      # NOTE(review): index 2 is presumably the per-pair error slot of the
      # status tuple -- confirm against gcsio.copy_batch.
      self.assertIsNone(status[2], status)
    for _src, _dst in src_dst_pairs:
      self._verify_copy(_src, _dst, kms_key_name)

  @pytest.mark.it_postcommit
  def test_copy_batch(self):
    """Batch-copies the default input without a KMS key."""
    self._test_copy_batch("test_copy_batch")

  @pytest.mark.it_postcommit
  def test_copy_batch_kms(self):
    """Batch-copies the default input using the supplied KMS key."""
    self._skip_unless_kms_key()
    self._test_copy_batch("test_copy_batch_kms", self.kms_key_name)

  @pytest.mark.it_postcommit
  def test_copy_batch_rewrite_token(self):
    """Checks that a batch copy is carried out as a multi-part rewrite."""
    # Tests a multi-part copy (rewrite) operation. This is triggered by a
    # combination of 3 conditions:
    #  - a large enough src
    #  - setting max_bytes_rewritten_per_call
    #  - setting kms_key_name
    self._skip_unless_kms_key()

    rewrite_responses = self._capture_rewrite_responses()
    self._test_copy_batch(
        "test_copy_batch_rewrite_token",
        kms_key_name=self.kms_key_name,
        max_bytes_rewritten_per_call=self._MAX_BYTES_REWRITTEN_PER_CALL,
        src=self.INPUT_FILE_LARGE)
    # Verify that there was a multi-part rewrite.
    self.assertTrue(any(not r.done for r in rewrite_responses))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()