github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsio_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for Google Cloud Storage client."""
# pytype: skip-file

import datetime
import errno
import io
import logging
import os
import random
import time
import unittest
from email.message import Message

import httplib2
import mock

# Protect against environments where apitools library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
from apache_beam.metrics import monitoring_infos
from apache_beam.metrics.execution import MetricsEnvironment
from apache_beam.metrics.metricbase import MetricName

try:
  from apache_beam.io.gcp import gcsio, resource_identifiers
  from apache_beam.io.gcp.internal.clients import storage
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position

DEFAULT_GCP_PROJECT = 'apache-beam-testing'
DEFAULT_PROJECT_NUMBER = 1


class FakeGcsClient(object):
  # Fake storage client.  Usage in gcsio.py is client.objects.Get(...) and
  # client.objects.Insert(...).

  def __init__(self):
    self.objects = FakeGcsObjects()
    self.buckets = FakeGcsBuckets()
    # Referenced in GcsIO.copy_batch() and GcsIO.delete_batch().
    self._http = object()
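

# Illustrative sketch (not part of the original file): the call pattern the
# fake above emulates. The request type name comes from the generated
# apitools client and is an assumption here; bucket and object values are
# hypothetical.
#
#   client = FakeGcsClient()
#   request = storage.StorageObjectsGetRequest(
#       bucket='some-bucket', object='some-object')
#   metadata = client.objects.Get(request)  # returns a storage.Object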


class FakeFile(object):
  def __init__(
      self, bucket, obj, contents, generation, crc32c=None, last_updated=None):
    self.bucket = bucket
    self.object = obj
    self.contents = contents
    self.generation = generation
    self.crc32c = crc32c
    self.last_updated = last_updated

  def get_metadata(self):
    last_updated_datetime = None
    if self.last_updated:
      last_updated_datetime = datetime.datetime.utcfromtimestamp(
          self.last_updated)

    return storage.Object(
        bucket=self.bucket,
        name=self.object,
        generation=self.generation,
        size=len(self.contents),
        crc32c=self.crc32c,
        updated=last_updated_datetime)


class FakeGcsBuckets(object):
  def __init__(self):
    pass

  def get_bucket(self, bucket):
    return storage.Bucket(name=bucket, projectNumber=DEFAULT_PROJECT_NUMBER)

  def Get(self, get_request):
    return self.get_bucket(get_request.bucket)


class FakeGcsObjects(object):
  def __init__(self):
    self.files = {}
    # Store the last generation used for a given object name. Note that this
    # has to persist even past the deletion of the object.
    self.last_generation = {}
    self.list_page_tokens = {}
    self._fail_when_getting_metadata = []
    self._fail_when_reading = []

  def add_file(
      self, f, fail_when_getting_metadata=False, fail_when_reading=False):
    self.files[(f.bucket, f.object)] = f
    self.last_generation[(f.bucket, f.object)] = f.generation
    if fail_when_getting_metadata:
      self._fail_when_getting_metadata.append(f)
    if fail_when_reading:
      self._fail_when_reading.append(f)

  def get_file(self, bucket, obj):
    return self.files.get((bucket, obj), None)

  def delete_file(self, bucket, obj):
    del self.files[(bucket, obj)]

  def get_last_generation(self, bucket, obj):
    return self.last_generation.get((bucket, obj), 0)

  def Get(self, get_request, download=None):  # pylint: disable=invalid-name
    f = self.get_file(get_request.bucket, get_request.object)
    if f is None:
      # Fail with an HTTP 404 if the file does not exist.
      raise HttpError({'status': 404}, None, None)
    if download is None:
      if f in self._fail_when_getting_metadata:
        raise HttpError({'status': 429}, None, None)
      return f.get_metadata()
    else:
      if f in self._fail_when_reading:
        raise HttpError({'status': 429}, None, None)
      stream = download.stream

      def get_range_callback(start, end):
        if not 0 <= start <= end < len(f.contents):
          raise ValueError(
              'start=%d end=%d len=%s' % (start, end, len(f.contents)))
        stream.write(f.contents[start:end + 1])

      download.GetRange = get_range_callback

  def Insert(self, insert_request, upload=None):  # pylint: disable=invalid-name
    assert upload is not None
    generation = self.get_last_generation(
        insert_request.bucket, insert_request.name) + 1
    f = FakeFile(insert_request.bucket, insert_request.name, b'', generation)

    # Stream data into the file.
    stream = upload.stream
    data_list = []
    while True:
      data = stream.read(1024 * 1024)
      if not data:
        break
      data_list.append(data)
    f.contents = b''.join(data_list)

    self.add_file(f)

  REWRITE_TOKEN = 'test_token'

  def Rewrite(self, rewrite_request):  # pylint: disable=invalid-name
    if rewrite_request.rewriteToken == self.REWRITE_TOKEN:
      dest_object = storage.Object()
      return storage.RewriteResponse(
          done=True,
          objectSize=100,
          resource=dest_object,
          totalBytesRewritten=100)

    src_file = self.get_file(
        rewrite_request.sourceBucket, rewrite_request.sourceObject)
    if not src_file:
      raise HttpError(
          httplib2.Response({'status': '404'}),
          '404 Not Found',
          'https://fake/url')
    generation = self.get_last_generation(
        rewrite_request.destinationBucket,
        rewrite_request.destinationObject) + 1
    dest_file = FakeFile(
        rewrite_request.destinationBucket,
        rewrite_request.destinationObject,
        src_file.contents,
        generation)
    self.add_file(dest_file)
    time.sleep(10)  # time.sleep and time.time are mocked below.
    return storage.RewriteResponse(
        done=False,
        objectSize=100,
        rewriteToken=self.REWRITE_TOKEN,
        totalBytesRewritten=5)
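
  # Illustrative sketch (not in the original file): the two-step rewrite
  # protocol this fake models. The first Rewrite() call copies the data but
  # reports done=False with REWRITE_TOKEN; callers resend the request with
  # that token until done=True.
  #
  #   response = client.objects.Rewrite(rewrite_request)
  #   while not response.done:
  #     rewrite_request.rewriteToken = response.rewriteToken
  #     response = client.objects.Rewrite(rewrite_request)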

  def Delete(self, delete_request):  # pylint: disable=invalid-name
    # Here, we emulate the behavior of the GCS service in raising a 404 error
    # if this object does not exist.
    if self.get_file(delete_request.bucket, delete_request.object):
      self.delete_file(delete_request.bucket, delete_request.object)
    else:
      raise HttpError(
          httplib2.Response({'status': '404'}),
          '404 Not Found',
          'https://fake/url')

  def List(self, list_request):  # pylint: disable=invalid-name
    bucket = list_request.bucket
    prefix = list_request.prefix or ''
    matching_files = []
    for file_bucket, file_name in sorted(iter(self.files)):
      if bucket == file_bucket and file_name.startswith(prefix):
        file_object = self.files[(file_bucket, file_name)].get_metadata()
        matching_files.append(file_object)

    # Handle pagination.
    items_per_page = 5
    if not list_request.pageToken:
      range_start = 0
    else:
      if list_request.pageToken not in self.list_page_tokens:
        raise ValueError('Invalid page token.')
      range_start = self.list_page_tokens[list_request.pageToken]
      del self.list_page_tokens[list_request.pageToken]

    result = storage.Objects(
        items=matching_files[range_start:range_start + items_per_page])
    if range_start + items_per_page < len(matching_files):
      next_range_start = range_start + items_per_page
      next_page_token = '_page_token_%s_%s_%d' % (
          bucket, prefix, next_range_start)
      self.list_page_tokens[next_page_token] = next_range_start
      result.nextPageToken = next_page_token
    return result
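
  # Illustrative sketch (not in the original file): paging through List()
  # results the way a caller would, five items per fake page. The request
  # type name is an assumption about the generated apitools client; bucket
  # and prefix values are hypothetical.
  #
  #   request = storage.StorageObjectsListRequest(bucket='b', prefix='p')
  #   while True:
  #     response = client.objects.List(request)
  #     ...  # consume response.items
  #     if not response.nextPageToken:
  #       break
  #     request.pageToken = response.nextPageToken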


class FakeApiCall(object):
  def __init__(self, exception, response):
    self.exception = exception
    self.is_error = exception is not None
    # Response for Rewrite:
    self.response = response


class FakeBatchApiRequest(object):
  def __init__(self, **unused_kwargs):
    self.operations = []

  def Add(self, service, method, request):  # pylint: disable=invalid-name
    self.operations.append((service, method, request))

  def Execute(self, unused_http, **unused_kwargs):  # pylint: disable=invalid-name
    api_calls = []
    for service, method, request in self.operations:
      exception = None
      response = None
      try:
        response = getattr(service, method)(request)
      except Exception as e:  # pylint: disable=broad-except
        exception = e
      api_calls.append(FakeApiCall(exception, response))
    return api_calls
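

# Illustrative sketch (not in the original file): Execute() runs each queued
# (service, method, request) triple and records per-operation outcomes
# instead of raising, mirroring how apitools' BatchApiRequest reports
# partial failures. Request values here are hypothetical.
#
#   batch = FakeBatchApiRequest()
#   batch.Add(client.objects, 'Delete', delete_request)
#   for call in batch.Execute(None):
#     if call.is_error:
#       ...  # inspect call.exception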


@unittest.skipIf(HttpError is None, 'GCP dependencies are not installed')
class TestGCSPathParser(unittest.TestCase):

  BAD_GCS_PATHS = [
      'gs://',
      'gs://bucket',
      'gs:///name',
      'gs:///',
      'gs:/blah/bucket/name',
  ]

  def test_gcs_path(self):
    self.assertEqual(
        gcsio.parse_gcs_path('gs://bucket/name'), ('bucket', 'name'))
    self.assertEqual(
        gcsio.parse_gcs_path('gs://bucket/name/sub'), ('bucket', 'name/sub'))

  def test_bad_gcs_path(self):
    for path in self.BAD_GCS_PATHS:
      self.assertRaises(ValueError, gcsio.parse_gcs_path, path)
    self.assertRaises(ValueError, gcsio.parse_gcs_path, 'gs://bucket/')

  def test_gcs_path_object_optional(self):
    self.assertEqual(
        gcsio.parse_gcs_path('gs://bucket/name', object_optional=True),
        ('bucket', 'name'))
    self.assertEqual(
        gcsio.parse_gcs_path('gs://bucket/', object_optional=True),
        ('bucket', ''))

  def test_bad_gcs_path_object_optional(self):
    for path in self.BAD_GCS_PATHS:
      self.assertRaises(ValueError, gcsio.parse_gcs_path, path, True)


class SampleOptions(object):
  def __init__(self, project, region, kms_key=None):
    self.project = DEFAULT_GCP_PROJECT
    self.region = region
    self.dataflow_kms_key = kms_key


@unittest.skipIf(HttpError is None, 'GCP dependencies are not installed')
@mock.patch.multiple(
    'time', time=mock.MagicMock(side_effect=range(100)), sleep=mock.MagicMock())
class TestGCSIO(unittest.TestCase):
  def _insert_random_file(
      self,
      client,
      path,
      size,
      generation=1,
      crc32c=None,
      last_updated=None,
      fail_when_getting_metadata=False,
      fail_when_reading=False):
    bucket, name = gcsio.parse_gcs_path(path)
    f = FakeFile(
        bucket,
        name,
        os.urandom(size),
        generation,
        crc32c=crc32c,
        last_updated=last_updated)
    client.objects.add_file(f, fail_when_getting_metadata, fail_when_reading)
    return f

  def setUp(self):
    self.client = FakeGcsClient()
    self.gcs = gcsio.GcsIO(self.client)

  def test_default_bucket_name(self):
    self.assertEqual(
        gcsio.default_gcs_bucket_name(DEFAULT_GCP_PROJECT, "us-central1"),
        'dataflow-staging-us-central1-77b801c0838aee13391c0d1885860494')

  def test_default_bucket_name_failure(self):
    self.assertEqual(
        gcsio.get_or_create_default_gcs_bucket(
            SampleOptions(
                DEFAULT_GCP_PROJECT, "us-central1", kms_key="kmskey!")),
        None)

  def test_num_retries(self):
    # BEAM-7424: update num_retries accordingly if storage_client is
    # regenerated.
    self.assertEqual(gcsio.GcsIO().client.num_retries, 20)

  def test_retry_func(self):
    # BEAM-7667: update retry_func accordingly if storage_client is
    # regenerated.
    self.assertIsNotNone(gcsio.GcsIO().client.retry_func)

  def test_exists(self):
    file_name = 'gs://gcsio-test/dummy_file'
    file_size = 1234
    self._insert_random_file(self.client, file_name, file_size)
    self.assertFalse(self.gcs.exists(file_name + 'xyz'))
    self.assertTrue(self.gcs.exists(file_name))

  @mock.patch.object(FakeGcsObjects, 'Get')
  def test_exists_failure(self, mock_get):
    # Raise an error other than 404; a 404 is a valid outcome for an
    # exists() call, which then returns False.
    mock_get.side_effect = HttpError({'status': 400}, None, None)
    file_name = 'gs://gcsio-test/dummy_file'
    file_size = 1234
    self._insert_random_file(self.client, file_name, file_size)
    with self.assertRaises(HttpError) as cm:
      self.gcs.exists(file_name)
    self.assertEqual(400, cm.exception.status_code)
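
  # Illustrative sketch (not in the original file) of the contract exercised
  # above: exists() treats a 404 as "object missing" and returns False, while
  # any other HttpError propagates to the caller.
  #
  #   try:
  #     found = gcs.exists('gs://bucket/obj')
  #   except HttpError as e:
  #     ...  # e.status_code is something other than 404, e.g. 400 or 403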

  def test_checksum(self):
    file_name = 'gs://gcsio-test/dummy_file'
    file_size = 1234
    checksum = 'deadbeef'
    self._insert_random_file(self.client, file_name, file_size, crc32c=checksum)
    self.assertTrue(self.gcs.exists(file_name))
    self.assertEqual(checksum, self.gcs.checksum(file_name))

  def test_size(self):
    file_name = 'gs://gcsio-test/dummy_file'
    file_size = 1234

    self._insert_random_file(self.client, file_name, file_size)
    self.assertTrue(self.gcs.exists(file_name))
    self.assertEqual(1234, self.gcs.size(file_name))

  def test_last_updated(self):
    file_name = 'gs://gcsio-test/dummy_file'
    file_size = 1234
    last_updated = 123456.78

    self._insert_random_file(
        self.client, file_name, file_size, last_updated=last_updated)
    self.assertTrue(self.gcs.exists(file_name))
    self.assertEqual(last_updated, self.gcs.last_updated(file_name))

  def test_file_status(self):
    file_name = 'gs://gcsio-test/dummy_file'
    file_size = 1234
    last_updated = 123456.78
    checksum = 'deadbeef'

    self._insert_random_file(
        self.client,
        file_name,
        file_size,
        last_updated=last_updated,
        crc32c=checksum)
    file_checksum = self.gcs.checksum(file_name)

    file_status = self.gcs._status(file_name)

    self.assertEqual(file_status['size'], file_size)
    self.assertEqual(file_status['checksum'], file_checksum)
    self.assertEqual(file_status['last_updated'], last_updated)

  def test_file_mode(self):
    file_name = 'gs://gcsio-test/dummy_mode_file'
    with self.gcs.open(file_name, 'wb') as f:
      assert f.mode == 'wb'
    with self.gcs.open(file_name, 'rb') as f:
      assert f.mode == 'rb'

  def test_bad_file_modes(self):
    file_name = 'gs://gcsio-test/dummy_mode_file'
    with self.assertRaises(ValueError):
      self.gcs.open(file_name, 'w+')
    with self.assertRaises(ValueError):
      self.gcs.open(file_name, 'r+b')

  def test_empty_batches(self):
    self.assertEqual([], self.gcs.copy_batch([]))
    self.assertEqual([], self.gcs.delete_batch([]))

  def test_delete(self):
    file_name = 'gs://gcsio-test/delete_me'
    file_size = 1024

    # Test deletion of non-existent file.
    self.gcs.delete(file_name)

    self._insert_random_file(self.client, file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(file_name) in self.client.objects.files)

    self.gcs.delete(file_name)

    self.assertFalse(
        gcsio.parse_gcs_path(file_name) in self.client.objects.files)

  @mock.patch(
      'apache_beam.io.gcp.gcsio.auth.get_service_credentials',
      wraps=lambda pipeline_options: None)
  @mock.patch('apache_beam.io.gcp.gcsio.get_new_http')
  def test_user_agent_passed(self, get_new_http_mock, get_service_creds_mock):
    client = gcsio.GcsIO()
    try:
      client.get_bucket('mabucket')
    except:  # pylint: disable=bare-except
      # Ignore errors. The errors come from the fact that we did not mock
      # the response from the API, so the overall get_bucket call fails
      # soon after the GCS API is called.
      pass
    call = get_new_http_mock.return_value.request.mock_calls[-2]
    self.assertIn('apache-beam-', call[2]['headers']['User-Agent'])

  @mock.patch('apache_beam.io.gcp.gcsio.BatchApiRequest')
  def test_delete_batch(self, *unused_args):
    gcsio.BatchApiRequest = FakeBatchApiRequest
    file_name_pattern = 'gs://gcsio-test/delete_me_%d'
    file_size = 1024
    num_files = 10

    # Test deletion of non-existent files.
    result = self.gcs.delete_batch(
        [file_name_pattern % i for i in range(num_files)])
    self.assertTrue(result)
    for i, (file_name, exception) in enumerate(result):
      self.assertEqual(file_name, file_name_pattern % i)
      self.assertEqual(exception, None)
      self.assertFalse(self.gcs.exists(file_name_pattern % i))

    # Insert some files.
    for i in range(num_files):
      self._insert_random_file(self.client, file_name_pattern % i, file_size)

    # Check files inserted properly.
    for i in range(num_files):
      self.assertTrue(self.gcs.exists(file_name_pattern % i))

    # Execute batch delete.
    self.gcs.delete_batch([file_name_pattern % i for i in range(num_files)])

    # Check files deleted properly.
    for i in range(num_files):
      self.assertFalse(self.gcs.exists(file_name_pattern % i))

  def test_copy(self):
    src_file_name = 'gs://gcsio-test/source'
    dest_file_name = 'gs://gcsio-test/dest'
    file_size = 1024
    self._insert_random_file(self.client, src_file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertFalse(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.copy(src_file_name, dest_file_name, dest_kms_key_name='kms_key')

    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertTrue(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    # Test copy of non-existent files.
    with self.assertRaisesRegex(HttpError, r'Not Found'):
      self.gcs.copy(
          'gs://gcsio-test/non-existent',
          'gs://gcsio-test/non-existent-destination')

  @mock.patch('apache_beam.io.gcp.gcsio.BatchApiRequest')
  def test_copy_batch(self, *unused_args):
    gcsio.BatchApiRequest = FakeBatchApiRequest
    from_name_pattern = 'gs://gcsio-test/copy_me_%d'
    to_name_pattern = 'gs://gcsio-test/destination_%d'
    file_size = 1024
    num_files = 10

    result = self.gcs.copy_batch([(from_name_pattern % i, to_name_pattern % i)
                                  for i in range(num_files)],
                                 dest_kms_key_name='kms_key')
    self.assertTrue(result)
    for i, (src, dest, exception) in enumerate(result):
      self.assertEqual(src, from_name_pattern % i)
      self.assertEqual(dest, to_name_pattern % i)
      self.assertTrue(isinstance(exception, IOError))
      self.assertEqual(exception.errno, errno.ENOENT)
      self.assertFalse(self.gcs.exists(from_name_pattern % i))
      self.assertFalse(self.gcs.exists(to_name_pattern % i))

    # Insert some files.
    for i in range(num_files):
      self._insert_random_file(self.client, from_name_pattern % i, file_size)

    # Check files inserted properly.
    for i in range(num_files):
      self.assertTrue(self.gcs.exists(from_name_pattern % i))

    # Execute batch copy.
    self.gcs.copy_batch([(from_name_pattern % i, to_name_pattern % i)
                         for i in range(num_files)])

    # Check files copied properly.
    for i in range(num_files):
      self.assertTrue(self.gcs.exists(from_name_pattern % i))
      self.assertTrue(self.gcs.exists(to_name_pattern % i))
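
  # Illustrative sketch (not in the original file): copy_batch() returns
  # (src, dest, exception) triples in input order, with exception None on
  # success, so per-file failures can be handled without aborting the batch.
  #
  #   for src, dest, exc in gcs.copy_batch(pairs):
  #     if exc is not None:
  #       ...  # e.g. an IOError with errno.ENOENT for a missing source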

  def test_copytree(self):
    src_dir_name = 'gs://gcsio-test/source/'
    dest_dir_name = 'gs://gcsio-test/dest/'
    file_size = 1024
    paths = ['a', 'b/c', 'b/d']
    for path in paths:
      src_file_name = src_dir_name + path
      dest_file_name = dest_dir_name + path
      self._insert_random_file(self.client, src_file_name, file_size)
      self.assertTrue(
          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
      self.assertFalse(
          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.copytree(src_dir_name, dest_dir_name)

    for path in paths:
      src_file_name = src_dir_name + path
      dest_file_name = dest_dir_name + path
      self.assertTrue(
          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
      self.assertTrue(
          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

  def test_rename(self):
    src_file_name = 'gs://gcsio-test/source'
    dest_file_name = 'gs://gcsio-test/dest'
    file_size = 1024
    self._insert_random_file(self.client, src_file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertFalse(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.rename(src_file_name, dest_file_name)

    self.assertFalse(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertTrue(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

  def test_full_file_read(self):
    file_name = 'gs://gcsio-test/full_file'
    file_size = 5 * 1024 * 1024 + 100
    random_file = self._insert_random_file(self.client, file_name, file_size)
    f = self.gcs.open(file_name)
    self.assertEqual(f.mode, 'r')
    f.seek(0, os.SEEK_END)
    self.assertEqual(f.tell(), file_size)
    self.assertEqual(f.read(), b'')
    f.seek(0)
    self.assertEqual(f.read(), random_file.contents)

  def test_file_random_seek(self):
    file_name = 'gs://gcsio-test/seek_file'
    file_size = 5 * 1024 * 1024 - 100
    random_file = self._insert_random_file(self.client, file_name, file_size)

    f = self.gcs.open(file_name)
    random.seed(0)
    for _ in range(0, 10):
      a = random.randint(0, file_size - 1)
      b = random.randint(0, file_size - 1)
      start, end = min(a, b), max(a, b)
      f.seek(start)
      self.assertEqual(f.tell(), start)
      self.assertEqual(
          f.read(end - start + 1), random_file.contents[start:end + 1])
      self.assertEqual(f.tell(), end + 1)
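
  # Illustrative sketch (not in the original file): GcsIO readers are
  # seekable file-like objects, so an inclusive byte range [start, end] is
  # fetched with the standard idiom used above.
  #
  #   f = gcs.open('gs://bucket/obj')
  #   f.seek(start)
  #   data = f.read(end - start + 1)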

  def test_file_iterator(self):
    file_name = 'gs://gcsio-test/iterating_file'
    lines = []
    line_count = 10
    for _ in range(line_count):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace(b'\n', b' ') + b'\n'
      lines.append(line)

    contents = b''.join(lines)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.client.objects.add_file(FakeFile(bucket, name, contents, 1))

    f = self.gcs.open(file_name)

    read_lines = 0
    for line in f:
      read_lines += 1

    self.assertEqual(read_lines, line_count)

  def test_file_read_line(self):
    file_name = 'gs://gcsio-test/read_line_file'
    lines = []

    # Set a small buffer size to exercise refilling the buffer.
    # The first line is carefully crafted so that its newline falls on the
    # last character of the buffer, to exercise this code path.
    read_buffer_size = 1024
    lines.append(b'x' * 1023 + b'\n')

    for _ in range(1, 1000):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace(b'\n', b' ') + b'\n'
      lines.append(line)
    contents = b''.join(lines)

    file_size = len(contents)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.client.objects.add_file(FakeFile(bucket, name, contents, 1))

    f = self.gcs.open(file_name, read_buffer_size=read_buffer_size)

    # Test read of first two lines.
    f.seek(0)
    self.assertEqual(f.readline(), lines[0])
    self.assertEqual(f.tell(), len(lines[0]))
    self.assertEqual(f.readline(), lines[1])

    # Test read at line boundary.
    f.seek(file_size - len(lines[-1]) - 1)
    self.assertEqual(f.readline(), b'\n')

    # Test read at end of file.
    f.seek(file_size)
    self.assertEqual(f.readline(), b'')

    # Test reads at random positions.
    random.seed(0)
    for _ in range(0, 10):
      start = random.randint(0, file_size - 1)
      line_index = 0
      # Find the line corresponding to the start index.
      chars_left = start
      while True:
        next_line_length = len(lines[line_index])
        if chars_left - next_line_length < 0:
          break
        chars_left -= next_line_length
        line_index += 1
      f.seek(start)
      self.assertEqual(f.readline(), lines[line_index][chars_left:])

  def test_file_write(self):
    file_name = 'gs://gcsio-test/write_file'
    file_size = 5 * 1024 * 1024 + 2000
    contents = os.urandom(file_size)
    f = self.gcs.open(file_name, 'w')
    self.assertEqual(f.mode, 'w')
    f.write(contents[0:1000])
    f.write(contents[1000:1024 * 1024])
    f.write(contents[1024 * 1024:])
    f.close()
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.assertEqual(
        self.client.objects.get_file(bucket, name).contents, contents)

  def test_file_close(self):
    file_name = 'gs://gcsio-test/close_file'
    file_size = 5 * 1024 * 1024 + 2000
    contents = os.urandom(file_size)
    f = self.gcs.open(file_name, 'w')
    self.assertEqual(f.mode, 'w')
    f.write(contents)
    f.close()
    f.close()  # This should not crash.
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.assertEqual(
        self.client.objects.get_file(bucket, name).contents, contents)

  def test_file_flush(self):
    file_name = 'gs://gcsio-test/flush_file'
    file_size = 5 * 1024 * 1024 + 2000
    contents = os.urandom(file_size)
    bucket, name = gcsio.parse_gcs_path(file_name)
    f = self.gcs.open(file_name, 'w')
    self.assertEqual(f.mode, 'w')
    f.write(contents[0:1000])
    f.flush()
    f.write(contents[1000:1024 * 1024])
    f.flush()
    f.flush()  # Should be a NOOP.
    f.write(contents[1024 * 1024:])
    f.close()  # This should already call the equivalent of flush() in its body.
    self.assertEqual(
        self.client.objects.get_file(bucket, name).contents, contents)
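
  # Illustrative sketch (not in the original file) of the writer lifecycle
  # asserted above: flush() may be called at any point (a no-op when nothing
  # is buffered), and close() flushes remaining bytes, finalizes the upload,
  # and is itself safe to call twice.
  #
  #   f = gcs.open('gs://bucket/obj', 'w')
  #   f.write(data)
  #   f.flush()  # optional mid-stream flush
  #   f.close()  # flushes and completes the upload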

  def test_context_manager(self):
    # Test writing with a context manager.
    file_name = 'gs://gcsio-test/context_manager_file'
    file_size = 1024
    contents = os.urandom(file_size)
    with self.gcs.open(file_name, 'w') as f:
      f.write(contents)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.assertEqual(
        self.client.objects.get_file(bucket, name).contents, contents)

    # Test reading with a context manager.
    with self.gcs.open(file_name) as f:
      self.assertEqual(f.read(), contents)

    # Test that exceptions are not swallowed by the context manager.
    with self.assertRaises(ZeroDivisionError):
      with self.gcs.open(file_name) as f:
        f.read(0 // 0)

  def test_list_prefix(self):
    bucket_name = 'gcsio-test'
    objects = [
        ('cow/cat/fish', 2),
        ('cow/cat/blubber', 3),
        ('cow/dog/blubber', 4),
    ]
    for (object_name, size) in objects:
      file_name = 'gs://%s/%s' % (bucket_name, object_name)
      self._insert_random_file(self.client, file_name, size)
    test_cases = [
        (
            'gs://gcsio-test/c',
            [
                ('cow/cat/fish', 2),
                ('cow/cat/blubber', 3),
                ('cow/dog/blubber', 4),
            ]),
        (
            'gs://gcsio-test/cow/',
            [
                ('cow/cat/fish', 2),
                ('cow/cat/blubber', 3),
                ('cow/dog/blubber', 4),
            ]),
        ('gs://gcsio-test/cow/cat/fish', [
            ('cow/cat/fish', 2),
        ]),
    ]
    for file_pattern, expected_object_names in test_cases:
      expected_file_names = [('gs://%s/%s' % (bucket_name, object_name), size)
                             for (object_name, size) in expected_object_names]
      self.assertEqual(
          set(self.gcs.list_prefix(file_pattern).items()),
          set(expected_file_names))
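
  # Illustrative sketch (not in the original file): list_prefix() returns a
  # dict-like mapping of full 'gs://...' paths to object sizes, as the
  # assertions above rely on.
  #
  #   sizes = gcs.list_prefix('gs://bucket/dir/')
  #   total_bytes = sum(sizes.values())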

  def test_mime_binary_encoding(self):
    # This test verifies that the MIME email_generator library works properly
    # and does not corrupt '\r\n' during uploads (the patch to apitools in
    # Python 3 is applied in io/gcp/__init__.py).
    from apitools.base.py.transfer import email_generator
    generator_cls = email_generator.BytesGenerator
    output_buffer = io.BytesIO()
    generator = generator_cls(output_buffer)
    test_msg = 'a\nb\r\nc\n\r\n\n\nd'
    message = Message()
    message.set_payload(test_msg)
    generator._handle_text(message)
    self.assertEqual(test_msg.encode('ascii'), output_buffer.getvalue())

  def test_downloader_monitoring_info(self):
    # Clear the process-wide metric container.
    MetricsEnvironment.process_wide_container().reset()

    file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
    file_size = 5 * 1024 * 1024 + 100
    random_file = self._insert_random_file(self.client, file_name, file_size)
    self.gcs.open(file_name, 'r')

    resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
    labels = {
        monitoring_infos.SERVICE_LABEL: 'Storage',
        monitoring_infos.METHOD_LABEL: 'Objects.get',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
        monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
        monitoring_infos.STATUS_LABEL: 'ok'
    }

    metric_name = MetricName(
        None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
    metric_value = MetricsEnvironment.process_wide_container().get_counter(
        metric_name).get_cumulative()

    self.assertEqual(metric_value, 2)

  @mock.patch.object(FakeGcsBuckets, 'Get')
  def test_downloader_fail_to_get_project_number(self, mock_get):
    # Raise an error when getting the GCS bucket so that the project number
    # fails to be retrieved.
    mock_get.side_effect = HttpError({'status': 403}, None, None)
    # Clear the process-wide metric container.
    MetricsEnvironment.process_wide_container().reset()

    file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
    file_size = 5 * 1024 * 1024 + 100
    random_file = self._insert_random_file(self.client, file_name, file_size)
    self.gcs.open(file_name, 'r')

    resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
    labels = {
        monitoring_infos.SERVICE_LABEL: 'Storage',
        monitoring_infos.METHOD_LABEL: 'Objects.get',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
        monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
        monitoring_infos.STATUS_LABEL: 'ok'
    }

    metric_name = MetricName(
        None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
    metric_value = MetricsEnvironment.process_wide_container().get_counter(
        metric_name).get_cumulative()

    self.assertEqual(metric_value, 0)

    labels_without_project_id = {
        monitoring_infos.SERVICE_LABEL: 'Storage',
        monitoring_infos.METHOD_LABEL: 'Objects.get',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
        monitoring_infos.STATUS_LABEL: 'ok'
    }
    metric_name = MetricName(
        None,
        None,
        urn=monitoring_infos.API_REQUEST_COUNT_URN,
        labels=labels_without_project_id)
    metric_value = MetricsEnvironment.process_wide_container().get_counter(
        metric_name).get_cumulative()

    self.assertEqual(metric_value, 2)

  def test_downloader_fail_non_existent_object(self):
    file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
    with self.assertRaises(IOError):
      self.gcs.open(file_name, 'r')

  def test_downloader_fail_when_getting_metadata(self):
    file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
    file_size = 5 * 1024 * 1024 + 100
    self._insert_random_file(
        self.client, file_name, file_size, fail_when_getting_metadata=True)
    with self.assertRaises(HttpError):
      self.gcs.open(file_name, 'r')

  def test_downloader_fail_when_reading(self):
    file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
    file_size = 5 * 1024 * 1024 + 100
    self._insert_random_file(
        self.client, file_name, file_size, fail_when_reading=True)
    with self.assertRaises(HttpError):
      self.gcs.open(file_name, 'r')

  def test_uploader_monitoring_info(self):
    # Clear the process-wide metric container.
    MetricsEnvironment.process_wide_container().reset()

    file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
    file_size = 5 * 1024 * 1024 + 100
    random_file = self._insert_random_file(self.client, file_name, file_size)
    f = self.gcs.open(file_name, 'w')

    resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
    labels = {
        monitoring_infos.SERVICE_LABEL: 'Storage',
        monitoring_infos.METHOD_LABEL: 'Objects.insert',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
        monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
        monitoring_infos.STATUS_LABEL: 'ok'
    }

    f.close()
    metric_name = MetricName(
        None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
    metric_value = MetricsEnvironment.process_wide_container().get_counter(
        metric_name).get_cumulative()

    self.assertEqual(metric_value, 1)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()