#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for S3 client."""
# pytype: skip-file

import logging
import os
import random
import time
import unittest

from apache_beam.io.aws import s3io
from apache_beam.io.aws.clients.s3 import fake_client
from apache_beam.io.aws.clients.s3 import messages
from apache_beam.options import pipeline_options


class TestS3PathParser(unittest.TestCase):
  """Tests for parse_s3_path: s3://bucket/key splitting and validation."""

  BAD_S3_PATHS = [
      's3://',
      's3://bucket',
      's3:///name',
      's3:///',
      's3:/blah/bucket/name',
  ]

  def test_s3_path(self):
    self.assertEqual(s3io.parse_s3_path('s3://bucket/name'), ('bucket', 'name'))
    self.assertEqual(
        s3io.parse_s3_path('s3://bucket/name/sub'), ('bucket', 'name/sub'))

  def test_bad_s3_path(self):
    for path in self.BAD_S3_PATHS:
      self.assertRaises(ValueError, s3io.parse_s3_path, path)
    self.assertRaises(ValueError, s3io.parse_s3_path, 's3://bucket/')

  def test_s3_path_object_optional(self):
    self.assertEqual(
        s3io.parse_s3_path('s3://bucket/name', object_optional=True),
        ('bucket', 'name'))
    self.assertEqual(
        s3io.parse_s3_path('s3://bucket/', object_optional=True),
        ('bucket', ''))

  def test_bad_s3_path_object_optional(self):
    for path in self.BAD_S3_PATHS:
      self.assertRaises(ValueError, s3io.parse_s3_path, path, True)


class TestS3IO(unittest.TestCase):
  """Tests for S3IO, runnable against a fake client or (integration) real S3."""
  def _insert_random_file(self, client, path, size):
    # Creates a file of `size` random bytes at `path` and returns the FakeFile
    # (whose .contents holds the bytes, even in integration mode).
    bucket, name = s3io.parse_s3_path(path)
    contents = os.urandom(size)
    fakeFile = fake_client.FakeFile(bucket, name, contents)

    if self.USE_MOCK:
      self.client.add_file(fakeFile)

    else:
      f = self.aws.open(path, 'w')
      f.write(contents)
      f.close()

    return fakeFile

  def setUp(self):

    # These tests can be run locally against a mock S3 client, or as integration
    # tests against the real S3 client.
    self.USE_MOCK = True

    # If you're running integration tests with S3, set this variable to be an
    # s3 path that you have access to where test data can be written. If you're
    # just running tests against the mock, this can be any s3 path. It should
    # end with a '/'.
    self.TEST_DATA_PATH = 's3://random-data-sets/beam_tests/'

    if self.USE_MOCK:
      self.client = fake_client.FakeS3Client()
      test_data_bucket, _ = s3io.parse_s3_path(self.TEST_DATA_PATH)
      self.client.known_buckets.add(test_data_bucket)
      self.aws = s3io.S3IO(self.client)

    else:
      self.aws = s3io.S3IO(options=pipeline_options.S3Options())
      self.client = self.aws.client

  def test_size(self):
    file_name = self.TEST_DATA_PATH + 'dummy_file'
    file_size = 1234

    self._insert_random_file(self.client, file_name, file_size)
    self.assertTrue(self.aws.exists(file_name))
    self.assertEqual(1234, self.aws.size(file_name))

    # Clean up
    self.aws.delete(file_name)

  def test_last_updated(self):
    file_name = self.TEST_DATA_PATH + 'dummy_file'
    file_size = 1234

    self._insert_random_file(self.client, file_name, file_size)
    self.assertTrue(self.aws.exists(file_name))
    # The time difference should be tiny for the mock client.
    # A loose tolerance is for the consideration of real s3 client.
    tolerance = 5 * 60  # 5 mins
    result = self.aws.last_updated(file_name)
    self.assertAlmostEqual(result, time.time(), delta=tolerance)

    # Clean up
    self.aws.delete(file_name)

  def test_checksum(self):
    file_name = self.TEST_DATA_PATH + 'checksum'
    file_size = 1024
    file_ = self._insert_random_file(self.client, file_name, file_size)

    original_etag = self.aws.checksum(file_name)

    self.aws.delete(file_name)

    with self.aws.open(file_name, 'w') as f:
      f.write(file_.contents)

    rewritten_etag = self.aws.checksum(file_name)

    self.assertEqual(original_etag, rewritten_etag)
    self.assertEqual(len(original_etag), 36)
    self.assertTrue(original_etag.endswith('-1"'))

    # Clean up
    self.aws.delete(file_name)

  def test_file_status(self):
    file_name = self.TEST_DATA_PATH + 'metadata'
    file_size = 1024
    self._insert_random_file(self.client, file_name, file_size)
    file_checksum = self.aws.checksum(file_name)
    file_timestamp = self.aws.last_updated(file_name)

    file_status = self.aws._status(file_name)

    self.assertEqual(file_status['size'], file_size)
    self.assertEqual(file_status['checksum'], file_checksum)
    self.assertEqual(file_status['last_updated'], file_timestamp)

    # Clean up
    self.aws.delete(file_name)

  def test_copy(self):
    src_file_name = self.TEST_DATA_PATH + 'source'
    dest_file_name = self.TEST_DATA_PATH + 'dest'
    file_size = 1024
    self._insert_random_file(self.client, src_file_name, file_size)

    self.assertTrue(src_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))
    self.assertFalse(
        dest_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))

    self.aws.copy(src_file_name, dest_file_name)

    self.assertTrue(src_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))
    self.assertTrue(dest_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))

    # Clean up
    self.aws.delete_files([src_file_name, dest_file_name])

    # Test copy of non-existent files.
    with self.assertRaises(messages.S3ClientError) as err:
      self.aws.copy(
          self.TEST_DATA_PATH + 'non-existent',
          self.TEST_DATA_PATH + 'non-existent-destination')

    self.assertTrue('Not Found' in err.exception.message)

  def test_copy_paths(self):
    from_name_pattern = self.TEST_DATA_PATH + 'copy_me_%d'
    to_name_pattern = self.TEST_DATA_PATH + 'destination_%d'
    file_size = 1024
    num_files = 10

    src_dest_pairs = [(from_name_pattern % i, to_name_pattern % i)
                      for i in range(num_files)]

    # Copying before the sources exist should report a 404 per pair.
    result = self.aws.copy_paths(src_dest_pairs)

    self.assertTrue(result)
    for i, (src, dest, exception) in enumerate(result):
      self.assertEqual(src, from_name_pattern % i)
      self.assertEqual(dest, to_name_pattern % i)
      self.assertTrue(isinstance(exception, messages.S3ClientError))
      self.assertEqual(exception.code, 404)
      self.assertFalse(self.aws.exists(from_name_pattern % i))
      self.assertFalse(self.aws.exists(to_name_pattern % i))

    # Insert some files.
    for i in range(num_files):
      self._insert_random_file(self.client, from_name_pattern % i, file_size)

    # Check files inserted properly.
    for i in range(num_files):
      self.assertTrue(self.aws.exists(from_name_pattern % i))

    # Execute batch copy.
    result = self.aws.copy_paths(src_dest_pairs)

    # Check files copied properly.
    for i in range(num_files):
      self.assertTrue(self.aws.exists(from_name_pattern % i))
      self.assertTrue(self.aws.exists(to_name_pattern % i))

    # Check results
    for i, (src, dest, exception) in enumerate(result):
      self.assertEqual(src_dest_pairs[i], (src, dest))
      self.assertEqual(exception, None)

    # Clean up
    all_files = set().union(*[set(pair) for pair in src_dest_pairs])
    self.aws.delete_files(all_files)

  def test_copy_paths_error(self):
    n_real_files = 3

    # Create some files
    from_path = self.TEST_DATA_PATH + 'copy_paths/'
    files = [from_path + '%d' % i for i in range(n_real_files)]
    to_path = self.TEST_DATA_PATH + 'destination/'
    destinations = [to_path + '%d' % i for i in range(n_real_files)]
    for file_ in files:
      self._insert_random_file(self.client, file_, 1024)

    # Add nonexistent files to the sources and destinations
    sources = files + [
        from_path + 'X',
        from_path + 'fake_directory_1/',
        from_path + 'fake_directory_2/'
    ]
    destinations += [
        to_path + 'X',
        to_path + 'fake_directory_1/',
        to_path + 'fake_directory_2'
    ]
    result = self.aws.copy_paths(list(zip(sources, destinations)))

    # The copy_paths function of class S3IO does not return one single
    # result when copying a directory. Instead, it returns the results
    # of copying every file in the source directory.
    self.assertEqual(len(result), len(sources) - 1)

    for _, _, err in result[:n_real_files]:
      self.assertTrue(err is None)

    for _, _, err in result[n_real_files:]:
      self.assertIsInstance(err, messages.S3ClientError)

    # For the same reason of copy_paths function of S3IO above
    # skip this assert.
    #self.assertEqual(result[-3][2].code, 404)
    self.assertEqual(result[-2][2].code, 404)
    self.assertEqual(result[-1][2].code, 400)

    # Clean up
    self.aws.delete_files(files)
    self.aws.delete_files(destinations)

  def test_copy_tree(self):
    src_dir_name = self.TEST_DATA_PATH + 'source/'
    dest_dir_name = self.TEST_DATA_PATH + 'dest/'
    file_size = 1024
    paths = ['a', 'b/c', 'b/d']
    for path in paths:
      src_file_name = src_dir_name + path
      dest_file_name = dest_dir_name + path
      self._insert_random_file(self.client, src_file_name, file_size)
      self.assertTrue(
          src_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))
      self.assertFalse(
          dest_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))

    results = self.aws.copy_tree(src_dir_name, dest_dir_name)

    for src_file_name, dest_file_name, err in results:

      self.assertTrue(src_dir_name in src_file_name)
      self.assertTrue(dest_dir_name in dest_file_name)
      self.assertIsNone(err)

      self.assertTrue(
          src_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))
      self.assertTrue(
          dest_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))

    # Clean up
    for path in paths:
      src_file_name = src_dir_name + path
      dest_file_name = dest_dir_name + path
      self.aws.delete_files([src_file_name, dest_file_name])

  def test_rename(self):
    src_file_name = self.TEST_DATA_PATH + 'source'
    dest_file_name = self.TEST_DATA_PATH + 'dest'
    file_size = 1024

    self._insert_random_file(self.client, src_file_name, file_size)

    self.assertTrue(src_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))
    self.assertFalse(
        dest_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))

    self.aws.rename(src_file_name, dest_file_name)

    self.assertFalse(src_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))
    self.assertTrue(dest_file_name in self.aws.list_prefix(self.TEST_DATA_PATH))

    # Clean up
    self.aws.delete_files([src_file_name, dest_file_name])

  def test_rename_files(self):
    from_name_pattern = self.TEST_DATA_PATH + 'to_rename_%d'
    to_name_pattern = self.TEST_DATA_PATH + 'been_renamed_%d'
    file_size = 1024
    num_files = 10

    src_dest_pairs = [(from_name_pattern % i, to_name_pattern % i)
                      for i in range(num_files)]

    # Renaming before the sources exist should report a 404 per pair.
    result = self.aws.rename_files(src_dest_pairs)

    self.assertTrue(result)
    for i, (src, dest, exception) in enumerate(result):
      self.assertEqual(src, from_name_pattern % i)
      self.assertEqual(dest, to_name_pattern % i)
      self.assertTrue(isinstance(exception, messages.S3ClientError))
      self.assertEqual(exception.code, 404)
      self.assertFalse(self.aws.exists(from_name_pattern % i))
      self.assertFalse(self.aws.exists(to_name_pattern % i))

    # Insert some files.
    for i in range(num_files):
      self._insert_random_file(self.client, from_name_pattern % i, file_size)

    # Check files inserted properly.
    for i in range(num_files):
      self.assertTrue(self.aws.exists(from_name_pattern % i))
      self.assertFalse(self.aws.exists(to_name_pattern % i))

    # Execute batch rename.
    self.aws.rename_files(src_dest_pairs)

    # Check files were renamed properly.
    for i in range(num_files):
      self.assertFalse(self.aws.exists(from_name_pattern % i))
      self.assertTrue(self.aws.exists(to_name_pattern % i))

    # Clean up
    all_files = set().union(*[set(pair) for pair in src_dest_pairs])
    self.aws.delete_files(all_files)

  def test_rename_files_with_errors(self):
    real_prefix = self.TEST_DATA_PATH + 'rename_batch_%s'
    fake_prefix = 's3://fake-bucket-68ae4b0ef7b9/rename_batch_%s'
    src_dest_pairs = [(prefix % 'src', prefix % 'dest')
                      for prefix in (real_prefix, fake_prefix)]

    # Create the file in the real bucket
    self._insert_random_file(self.client, real_prefix % 'src', 1024)

    # Execute batch rename
    result = self.aws.rename_files(src_dest_pairs)

    # First is the file in the real bucket, which shouldn't throw an error
    self.assertEqual(result[0][0], src_dest_pairs[0][0])
    self.assertEqual(result[0][1], src_dest_pairs[0][1])
    self.assertIsNone(result[0][2])

    # Second is the file in the fake bucket, which should throw a 404
    self.assertEqual(result[1][0], src_dest_pairs[1][0])
    self.assertEqual(result[1][1], src_dest_pairs[1][1])
    self.assertEqual(result[1][2].code, 404)

    # Clean up
    self.aws.delete(real_prefix % 'dest')

  def test_rename_files_with_errors_directory(self):

    # Make file
    dir_name = self.TEST_DATA_PATH + 'rename_dir/'
    file_name = dir_name + 'file'
    self._insert_random_file(self.client, file_name, 1024)

    self.assertTrue(self.aws.exists(file_name))

    # Renaming to a directory-style destination is rejected outright.
    with self.assertRaises(ValueError):
      self.aws.rename_files([(file_name, self.TEST_DATA_PATH + 'dir_dest/')])

    # Clean up
    self.aws.delete(file_name)

  def test_delete_paths(self):
    # Make files
    prefix = self.TEST_DATA_PATH + 'delete_paths/'
    file_names = [prefix + 'a', prefix + 'b/c']
    for file_name in file_names:
      self._insert_random_file(self.client, file_name, 1024)

    self.assertTrue(self.aws.exists(file_names[0]))
    self.assertTrue(self.aws.exists(file_names[1]))

    # Delete paths
    paths = [prefix + 'a', prefix + 'b/']
    self.aws.delete_paths(paths)

    self.assertFalse(self.aws.exists(file_names[0]))
    self.assertFalse(self.aws.exists(file_names[1]))

  def test_delete(self):
    file_name = self.TEST_DATA_PATH + 'delete_file'
    file_size = 1024

    # Test deletion of non-existent file (shouldn't raise any error)
    self.aws.delete(file_name)

    # Create the file and check that it was created
    self._insert_random_file(self.aws.client, file_name, file_size)
    files = self.aws.list_prefix(self.TEST_DATA_PATH)
    self.assertTrue(file_name in files)

    # Delete the file and check that it was deleted
    self.aws.delete(file_name)
    self.assertFalse(self.aws.exists(file_name))

  def test_delete_files(self, *unused_args):
    file_name_pattern = self.TEST_DATA_PATH + 'delete_batch/%d'
    file_size = 1024
    num_files = 5

    # Test deletion of non-existent files.
    result = self.aws.delete_files(
        [file_name_pattern % i for i in range(num_files)])
    self.assertTrue(result)
    for i, (file_name, exception) in enumerate(result):
      self.assertEqual(file_name, file_name_pattern % i)
      self.assertEqual(exception, None)
      self.assertFalse(self.aws.exists(file_name_pattern % i))

    # Insert some files.
    for i in range(num_files):
      self._insert_random_file(self.client, file_name_pattern % i, file_size)

    # Check files inserted properly.
    for i in range(num_files):
      self.assertTrue(self.aws.exists(file_name_pattern % i))

    # Execute batch delete.
    self.aws.delete_files([file_name_pattern % i for i in range(num_files)])

    # Check files deleted properly.
    for i in range(num_files):
      self.assertFalse(self.aws.exists(file_name_pattern % i))

  def test_delete_files_with_errors(self, *unused_args):
    real_file = self.TEST_DATA_PATH + 'delete_batch/file'
    fake_file = 's3://fake-bucket-68ae4b0ef7b9/delete_batch/file'
    filenames = [real_file, fake_file]

    result = self.aws.delete_files(filenames)

    # First is the file in the real bucket, which shouldn't throw an error
    self.assertEqual(result[0][0], filenames[0])
    self.assertIsNone(result[0][1])

    # Second is the file in the fake bucket, which should throw a 404
    self.assertEqual(result[1][0], filenames[1])
    self.assertEqual(result[1][1].code, 404)

  def test_delete_tree(self):

    root_path = self.TEST_DATA_PATH + 'delete_tree/'
    leaf_paths = ['a', 'b/c', 'b/d', 'b/d/e']
    paths = [root_path + leaf for leaf in leaf_paths]

    # Create file tree
    file_size = 1024
    for path in paths:
      self._insert_random_file(self.client, path, file_size)

    # Check that the files exist
    for path in paths:
      self.assertTrue(self.aws.exists(path))

    # Delete the tree
    self.aws.delete_tree(root_path)

    # Check that the files have been deleted
    for path in paths:
      self.assertFalse(self.aws.exists(path))

  def test_exists(self):
    file_name = self.TEST_DATA_PATH + 'exists'
    file_size = 1024

    self.assertFalse(self.aws.exists(file_name))

    self._insert_random_file(self.aws.client, file_name, file_size)

    self.assertTrue(self.aws.exists(file_name))

    # Clean up
    self.aws.delete(file_name)

    self.assertFalse(self.aws.exists(file_name))

  def test_file_mode(self):
    file_name = self.TEST_DATA_PATH + 'jerry/pigpen/bobby'
    with self.aws.open(file_name, 'w') as f:
      assert f.mode == 'w'
    with self.aws.open(file_name, 'r') as f:
      assert f.mode == 'r'

    # Clean up
    self.aws.delete(file_name)

  def test_full_file_read(self):
    file_name = self.TEST_DATA_PATH + 'jerry/pigpen/phil'
    file_size = 1024

    f = self._insert_random_file(self.aws.client, file_name, file_size)
    contents = f.contents

    f = self.aws.open(file_name)
    self.assertEqual(f.mode, 'r')
    f.seek(0, os.SEEK_END)
    self.assertEqual(f.tell(), file_size)
    self.assertEqual(f.read(), b'')
    f.seek(0)
    self.assertEqual(f.read(), contents)

    # Clean up
    self.aws.delete(file_name)

  def test_file_write(self):
    file_name = self.TEST_DATA_PATH + 'write_file'
    # Large enough to span multiple multipart-upload chunks.
    file_size = 8 * 1024 * 1024 + 2000
    contents = os.urandom(file_size)
    f = self.aws.open(file_name, 'w')
    self.assertEqual(f.mode, 'w')
    f.write(contents[0:1000])
    f.write(contents[1000:1024 * 1024])
    f.write(contents[1024 * 1024:])
    f.close()
    new_f = self.aws.open(file_name, 'r')
    new_f_contents = new_f.read()
    self.assertEqual(new_f_contents, contents)

    # Clean up
    self.aws.delete(file_name)

  def test_file_mime_type(self):
    if self.USE_MOCK:
      self.skipTest("The boto3_client mock doesn't support mime_types")

    mime_type = 'example/example'
    file_name = self.TEST_DATA_PATH + 'write_file'
    f = self.aws.open(file_name, 'w', mime_type=mime_type)
    f.write(b'a string of binary text')
    f.close()

    bucket, key = s3io.parse_s3_path(file_name)
    metadata = self.client.get_object_metadata(messages.GetRequest(bucket, key))

    self.assertEqual(mime_type, metadata.mime_type)

    # Clean up
    self.aws.delete(file_name)

  def test_file_random_seek(self):
    file_name = self.TEST_DATA_PATH + 'write_seek_file'
    file_size = 5 * 1024 * 1024 - 100
    contents = os.urandom(file_size)
    with self.aws.open(file_name, 'w') as wf:
      wf.write(contents)

    f = self.aws.open(file_name)
    random.seed(0)

    for _ in range(0, 10):
      a = random.randint(0, file_size - 1)
      b = random.randint(0, file_size - 1)
      start, end = min(a, b), max(a, b)
      f.seek(start)

      self.assertEqual(f.tell(), start)

      self.assertEqual(f.read(end - start + 1), contents[start:end + 1])
      self.assertEqual(f.tell(), end + 1)

    # Clean up
    self.aws.delete(file_name)

  def test_file_flush(self):
    file_name = self.TEST_DATA_PATH + 'flush_file'
    file_size = 5 * 1024 * 1024 + 2000
    contents = os.urandom(file_size)
    f = self.aws.open(file_name, 'w')
    self.assertEqual(f.mode, 'w')
    f.write(contents[0:1000])
    f.flush()
    f.write(contents[1000:1024 * 1024])
    f.flush()
    f.flush()  # Should be a NOOP.
    f.write(contents[1024 * 1024:])
    f.close()  # This should already call the equivalent of flush() in its body
    new_f = self.aws.open(file_name, 'r')
    new_f_contents = new_f.read()
    self.assertEqual(new_f_contents, contents)

    # Clean up
    self.aws.delete(file_name)

  def test_file_iterator(self):
    file_name = self.TEST_DATA_PATH + 'iterate_file'
    lines = []
    line_count = 10
    for _ in range(line_count):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace(b'\n', b' ') + b'\n'
      lines.append(line)

    contents = b''.join(lines)

    with self.aws.open(file_name, 'w') as wf:
      wf.write(contents)

    f = self.aws.open(file_name)

    read_lines = 0
    for line in f:
      read_lines += 1

    self.assertEqual(read_lines, line_count)

    # Clean up
    self.aws.delete(file_name)

  def test_file_read_line(self):
    file_name = self.TEST_DATA_PATH + 'read_line_file'
    lines = []

    # Set a small buffer size to exercise refilling the buffer.
    # First line is carefully crafted so the newline falls as the last character
    # of the buffer to exercise this code path.
    read_buffer_size = 1099
    lines.append(b'x' * 1023 + b'\n')

    for _ in range(1, 1000):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace(b'\n', b' ') + b'\n'
      lines.append(line)
    contents = b''.join(lines)

    file_size = len(contents)

    with self.aws.open(file_name, 'wb') as wf:
      wf.write(contents)

    f = self.aws.open(file_name, 'rb', read_buffer_size=read_buffer_size)

    # Test read of first two lines.
    f.seek(0)
    self.assertEqual(f.readline(), lines[0])
    self.assertEqual(f.tell(), len(lines[0]))
    self.assertEqual(f.readline(), lines[1])

    # Test read at line boundary.
    f.seek(file_size - len(lines[-1]) - 1)
    self.assertEqual(f.readline(), b'\n')

    # Test read at end of file.
    f.seek(file_size)
    self.assertEqual(f.readline(), b'')

    # Test reads at random positions.
    random.seed(0)
    for _ in range(0, 10):
      start = random.randint(0, file_size - 1)
      line_index = 0
      # Find line corresponding to start index.
      chars_left = start
      while True:
        next_line_length = len(lines[line_index])
        if chars_left - next_line_length < 0:
          break
        chars_left -= next_line_length
        line_index += 1
      f.seek(start)
      self.assertEqual(f.readline(), lines[line_index][chars_left:])

    # Clean up
    self.aws.delete(file_name)

  def test_file_close(self):
    file_name = self.TEST_DATA_PATH + 'close_file'
    file_size = 5 * 1024 * 1024 + 2000
    contents = os.urandom(file_size)
    f = self.aws.open(file_name, 'w')
    self.assertEqual(f.mode, 'w')
    f.write(contents)
    f.close()
    f.close()  # This should not crash.

    with self.aws.open(file_name, 'r') as f:
      read_contents = f.read()

    self.assertEqual(read_contents, contents)

    # Clean up
    self.aws.delete(file_name)

  def test_context_manager(self):
    # Test writing with a context manager.
    file_name = self.TEST_DATA_PATH + 'context_manager_file'
    file_size = 1024
    contents = os.urandom(file_size)
    with self.aws.open(file_name, 'w') as f:
      f.write(contents)

    with self.aws.open(file_name, 'r') as f:
      self.assertEqual(f.read(), contents)

    # Clean up
    self.aws.delete(file_name)

  def test_list_prefix(self):

    objects = [
        ('jerry/pigpen/phil', 5),
        ('jerry/pigpen/bobby', 3),
        ('jerry/billy/bobby', 4),
    ]

    for (object_name, size) in objects:
      file_name = self.TEST_DATA_PATH + object_name
      self._insert_random_file(self.aws.client, file_name, size)

    test_cases = [
        (
            self.TEST_DATA_PATH + 'j',
            [
                ('jerry/pigpen/phil', 5),
                ('jerry/pigpen/bobby', 3),
                ('jerry/billy/bobby', 4),
            ]),
        (
            self.TEST_DATA_PATH + 'jerry/',
            [
                ('jerry/pigpen/phil', 5),
                ('jerry/pigpen/bobby', 3),
                ('jerry/billy/bobby', 4),
            ]),
        (
            self.TEST_DATA_PATH + 'jerry/pigpen/phil', [
                ('jerry/pigpen/phil', 5),
            ]),
    ]

    for file_pattern, expected_object_names in test_cases:
      expected_file_names = [(self.TEST_DATA_PATH + object_name, size)
                             for (object_name, size) in expected_object_names]
      self.assertEqual(
          set(self.aws.list_prefix(file_pattern).items()),
          set(expected_file_names))

    # Clean up
    for (object_name, size) in objects:
      self.aws.delete(self.TEST_DATA_PATH + object_name)

  def test_midsize_file(self):
    file_name = self.TEST_DATA_PATH + 'midsized'
    file_size = 6 * 1024 * 1024
    self._insert_random_file(self.aws.client, file_name, file_size)
    with self.aws.open(file_name, 'r') as f:
      self.assertEqual(len(f.read()), file_size)
    self.aws.delete(file_name)

  def test_zerosize_file(self):
    file_name = self.TEST_DATA_PATH + 'zerosized'
    file_size = 0
    self._insert_random_file(self.aws.client, file_name, file_size)
    with self.aws.open(file_name, 'r') as f:
      self.assertEqual(len(f.read()), file_size)
    self.aws.delete(file_name)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()