#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for transforms defined in apache_beam.io.fileio."""

# pytype: skip-file

import csv
import io
import json
import logging
import os
import unittest
import uuid
import warnings

import pytest
from hamcrest.library.text import stringmatches

import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.io.filebasedsink_test import _TestCaseWithTempDirCleanUp
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.test_utils import compute_hash
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.testing.util import matches_all
from apache_beam.transforms import trigger
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import IntervalWindow
from apache_beam.utils.timestamp import Timestamp

warnings.filterwarnings(
    'ignore', category=FutureWarning, module='apache_beam.io.fileio_test')


def _get_file_reader(readable_file):
  return io.TextIOWrapper(readable_file.open())
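# ReadableFile.open() hands back a binary, file-like object; the CSV tests
# below wrap it in io.TextIOWrapper so csv.reader receives text lines. This
# assumes the fixture files use the platform default / UTF-8 encoding, which
# holds for the ASCII files these tests create themselves.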


class MatchTest(_TestCaseWithTempDirCleanUp):
  def test_basic_two_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir))
    files.append(self._create_temp_file(dir=tempdir))

    with TestPipeline() as p:
      files_pc = (
          p
          | fileio.MatchFiles(FileSystems.join(tempdir, '*'))
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))

  def test_match_all_two_directories(self):
    files = []
    directories = []

    for _ in range(2):
      # TODO: What about this having to append the ending slash?
      d = '%s%s' % (self._new_tempdir(), os.sep)
      directories.append(d)

      files.append(self._create_temp_file(dir=d))
      files.append(self._create_temp_file(dir=d))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll()
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))

  def test_match_files_one_directory_failure1(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)
    ]

    files = []
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with self.assertRaises(beam.io.filesystem.BeamIOError):
      with TestPipeline() as p:
        files_pc = (
            p
            | beam.Create([FileSystems.join(d, '*') for d in directories])
            | fileio.MatchAll(fileio.EmptyMatchTreatment.DISALLOW)
            | beam.Map(lambda x: x.path))

        assert_that(files_pc, equal_to(files))

  def test_match_files_one_directory_failure2(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)
    ]

    files = []
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))


class ReadTest(_TestCaseWithTempDirCleanUp):
  def test_basic_file_name_provided(self):
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

      assert_that(content_pc, equal_to(content.splitlines()))

  def test_csv_file_source(self):
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    rows = [r.split(',') for r in content.split('\n')]

    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(content_pc, equal_to(rows))

  def test_infer_compressed_file(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import gzip
    with gzip.GzipFile(os.path.join(dir, 'compressed.gz'), 'w') as f:
      f.write(file_contents)

    file_contents2 = b'compressed_contents_bz2!'
    import bz2
    with bz2.BZ2File(os.path.join(dir, 'compressed2.bz2'), 'w') as f:
      f.write(file_contents2)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(lambda rf: rf.open().readline()))

      assert_that(content_pc, equal_to([file_contents, file_contents2]))

  def test_read_bz2_compressed_file_without_suffix(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import bz2
    with bz2.BZ2File(os.path.join(dir, 'compressed'), 'w') as f:
      f.write(file_contents)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(
              lambda rf: rf.open(compression_type=CompressionTypes.BZIP2).read(
                  len(file_contents))))

      assert_that(content_pc, equal_to([file_contents]))

  def test_read_gzip_compressed_file_without_suffix(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import gzip
    with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
      f.write(file_contents)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(
              lambda rf: rf.open(compression_type=CompressionTypes.GZIP).read(
                  len(file_contents))))

      assert_that(content_pc, equal_to([file_contents]))

  def test_string_filenames_and_skip_directory(self):
    content = 'thecontent\n'
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir, content=content))
    files.append(self._create_temp_file(dir=tempdir, content=content))

    with TestPipeline() as p:
      contents_pc = (
          p
          | beam.Create(files + ['%s/' % tempdir])
          | fileio.ReadMatches()
          | beam.FlatMap(lambda x: x.read().decode('utf-8').splitlines()))

      assert_that(contents_pc, equal_to(content.splitlines() * 2))

  def test_fail_on_directories(self):
    content = 'thecontent\n'
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir, content=content))
    files.append(self._create_temp_file(dir=tempdir, content=content))

    with self.assertRaises(beam.io.filesystem.BeamIOError):
      with TestPipeline() as p:
        _ = (
            p
            | beam.Create(files + ['%s/' % tempdir])
            | fileio.ReadMatches(skip_directories=False)
            | beam.Map(lambda x: x.read_utf8()))
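# Note on the compression tests above: ReadableFile.open() defaults to
# CompressionTypes.AUTO, so compression is inferred from the file extension
# ('.gz', '.bz2', ...). When a file has no recognizable suffix, callers pass
# an explicit compression_type, as the two *_without_suffix tests do.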


class MatchIntegrationTest(unittest.TestCase):

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  KINGLEAR_CHECKSUM = 'f418b25f1507f5a901257026b035ac2857a7ab87'
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-00000000000*.json')

  WIKI_FILES = [
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000001.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000002.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000003.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000004.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000005.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000006.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000007.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000008.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000009.json',
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

  @pytest.mark.it_postcommit
  def test_transform_on_gcs(self):
    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      matches_pc = (
          p
          | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
          | fileio.MatchAll()
          | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

      assert_that(
          matches_pc,
          equal_to([self.INPUT_FILE] + self.WIKI_FILES),
          label='Matched Files')

      checksum_pc = (
          p
          | 'SingleFile' >> beam.Create([self.INPUT_FILE])
          | 'MatchOneAll' >> fileio.MatchAll()
          | fileio.ReadMatches()
          | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
          | 'Checksums' >> beam.Map(compute_hash))

      assert_that(
          checksum_pc,
          equal_to([self.KINGLEAR_CHECKSUM]),
          label='Assert Checksums')
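# MatchContinuously re-evaluates the file pattern every `interval` seconds
# between start_timestamp and stop_timestamp. With the default
# has_deduplication=True a path is emitted only once even if it matches in
# several polls; with has_deduplication=False it is emitted on every poll,
# which is why test_without_deduplication expects the pre-existing file twice.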


class MatchContinuouslyTest(_TestCaseWithTempDirCleanUp):
  def test_with_deduplication(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a file to be matched before the pipeline starts
    files.append(self._create_temp_file(dir=tempdir))
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              start_timestamp=start,
              stop_timestamp=stop)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))

  def test_without_deduplication(self):
    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a file to be matched before the pipeline starts
    file = self._create_temp_file(dir=tempdir)
    # Add file twice, since it will be matched for every interval
    files += [file, file]
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              has_deduplication=False,
              start_timestamp=start,
              stop_timestamp=stop)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))

  def test_match_updated_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    # Create two files to be matched before the pipeline starts
    files.append(self._create_temp_file(dir=tempdir))
    writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
    writer.close()

    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))
    files.append(FileSystems.join(tempdir, 'extra'))

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              start_timestamp=start,
              stop_timestamp=stop,
              match_updated_files=True)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))


class WriteFilesTest(_TestCaseWithTempDirCleanUp):

  SIMPLE_COLLECTION = [
      {
          'project': 'beam', 'foundation': 'apache'
      },
      {
          'project': 'prometheus', 'foundation': 'cncf'
      },
      {
          'project': 'flink', 'foundation': 'apache'
      },
      {
          'project': 'grpc', 'foundation': 'cncf'
      },
      {
          'project': 'spark', 'foundation': 'apache'
      },
      {
          'project': 'kubernetes', 'foundation': 'cncf'
      },
      {
          'project': 'spark', 'foundation': 'apache'
      },
      {
          'project': 'knative', 'foundation': 'cncf'
      },
      {
          'project': 'linux', 'foundation': 'linux'
      },
  ]

  LARGER_COLLECTION = ['{:05d}'.format(i) for i in range(200)]

  CSV_HEADERS = ['project', 'foundation']

  SIMPLE_COLLECTION_VALIDATION_SET = {(elm['project'], elm['foundation'])
                                      for elm in SIMPLE_COLLECTION}

  class CsvSink(fileio.TextSink):
    def __init__(self, headers):
      self.headers = headers

    def write(self, record):
      self._fh.write(','.join([record[h] for h in self.headers]).encode('utf8'))
      self._fh.write('\n'.encode('utf8'))

  class JsonSink(fileio.TextSink):
    def write(self, record):
      self._fh.write(json.dumps(record).encode('utf8'))
      self._fh.write('\n'.encode('utf8'))
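  # CsvSink and JsonSink above only override write(); fileio.TextSink (a
  # FileSink subclass) handles open() and flush() and stores the open file
  # handle as self._fh, so each write() call appends one encoded line to the
  # file that WriteToFiles opened for the current destination and bundle.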

  def test_write_to_single_file_batch(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | beam.io.fileio.WriteToFiles(path=dir))

    with TestPipeline() as p:
      result = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, '*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      assert_that(result, equal_to([row for row in self.SIMPLE_COLLECTION]))

  def test_write_to_dynamic_destination(self):

    sink_params = [
        fileio.TextSink,  # pass a type signature
        fileio.TextSink()  # pass a FileSink object
    ]

    for sink in sink_params:
      dir = self._new_tempdir()

      with TestPipeline() as p:
        _ = (
            p
            | "Create" >> beam.Create(range(100))
            | beam.Map(lambda x: str(x))
            | fileio.WriteToFiles(
                path=dir,
                destination=lambda n: "odd" if int(n) % 2 else "even",
                sink=sink,
                file_naming=fileio.destination_prefix_naming("test")))

      with TestPipeline() as p:
        result = (
            p
            | fileio.MatchFiles(FileSystems.join(dir, '*'))
            | fileio.ReadMatches()
            | beam.Map(
                lambda f: (
                    os.path.basename(f.metadata.path).split('-')[0],
                    sorted(map(int, f.read_utf8().strip().split('\n'))))))

        assert_that(
            result,
            equal_to([('odd', list(range(1, 100, 2))),
                      ('even', list(range(0, 100, 2)))]))
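  # max_writers_per_bundle caps how many destination files a bundle keeps open
  # at once; records for destinations beyond that cap are "spilled" and written
  # in a later, grouped step. Setting it to 1 below forces the spilling path,
  # and the streaming tests further down use 0 so that nothing is written
  # directly in the bundle and all records take the grouped path.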

  def test_write_to_different_file_types_some_spilling(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming(),
              max_writers_per_bundle=1))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')

  @unittest.skip('https://github.com/apache/beam/issues/21269')
  def test_find_orphaned_files(self):
    dir = self._new_tempdir()

    write_transform = beam.io.fileio.WriteToFiles(path=dir)

    def write_orphaned_file(temp_dir, writer_key):
      temp_dir_path = FileSystems.join(dir, temp_dir)

      file_prefix_dir = FileSystems.join(
          temp_dir_path, str(abs(hash(writer_key))))

      file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
      with FileSystems.create(file_name) as f:
        f.write(b'Hello y\'all')

      return file_name

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | write_transform)

      # Pre-create the temp directory.
      temp_dir_path = FileSystems.mkdirs(
          FileSystems.join(dir, write_transform._temp_directory.get()))
      write_orphaned_file(
          write_transform._temp_directory.get(), (None, GlobalWindow()))
      f2 = write_orphaned_file(
          write_transform._temp_directory.get(), ('other-dest', GlobalWindow()))

    temp_dir_path = FileSystems.join(dir, write_transform._temp_directory.get())
    leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
    found_files = [m.path for m in leftovers[0].metadata_list]
    self.assertListEqual(found_files, [f2])

  def test_write_to_different_file_types(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming()))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')

  def record_dofn(self):
    class RecordDoFn(beam.DoFn):
      def process(self, element):
        WriteFilesTest.all_records.append(element)

    return RecordDoFn()

  def test_streaming_complex_timing(self):
    # Use state on the TestCase class, since other references would be pickled
    # into a closure and not have the desired side effects.
    #
    # TODO(https://github.com/apache/beam/issues/18987): Use assert_that after
    # it works for the cases here in streaming mode.
    WriteFilesTest.all_records = []

    dir = '%s%s' % (self._new_tempdir(), os.sep)

    # Setting up the input (TestStream)
    ts = TestStream().advance_watermark_to(0)
    for elm in WriteFilesTest.LARGER_COLLECTION:
      timestamp = int(elm)

      ts.add_elements([('key', '%s' % elm)])
      if timestamp % 5 == 0 and timestamp != 0:
        # TODO(https://github.com/apache/beam/issues/18721): Add many firings
        # per window after getting PaneInfo.
        ts.advance_processing_time(5)
        ts.advance_watermark_to(timestamp)
    ts.advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
      file_name = fileio.destination_prefix_naming()(*args)
      return file_name.replace(':', '_')

    # The pipeline that we are testing
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      res = (
          p
          | ts
          | beam.WindowInto(
              FixedWindows(10),
              trigger=trigger.AfterWatermark(),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | beam.GroupByKey()
          | beam.FlatMap(lambda x: x[1]))
      # Triggering after 5 processing-time seconds, and on the watermark. Also
      # discarding old elements.

      _ = (
          res
          | beam.io.fileio.WriteToFiles(
              path=dir,
              file_naming=no_colon_file_naming,
              max_writers_per_bundle=0)
          | beam.Map(lambda fr: FileSystems.join(dir, fr.file_name))
          | beam.ParDo(self.record_dofn()))

    # Verification pipeline
    with TestPipeline() as p:
      files = (p | beam.io.fileio.MatchFiles(FileSystems.join(dir, '*')))

      file_names = (files | beam.Map(lambda fm: fm.path))

      file_contents = (
          files
          | beam.io.fileio.ReadMatches()
          | beam.Map(
              lambda rf: (rf.metadata.path, rf.read_utf8().strip().split('\n')))
      )

      content = (
          file_contents
          | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

      assert_that(
          file_names,
          equal_to(WriteFilesTest.all_records),
          label='AssertFilesMatch')
      assert_that(
          content,
          matches_all(WriteFilesTest.LARGER_COLLECTION),
          label='AssertContentsMatch')
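  # destination_prefix_naming() produces names of the form
  # '<destination>-<window start>-<window end>-<shard>-of-<total><suffix>'
  # for non-global windows (see test_shard_naming below). The streaming tests
  # wrap it in no_colon_file_naming to replace ':' with '_', keeping the
  # generated names valid on filesystems that reject colons; the regexp
  # assertions below match the underscore form.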

  def test_streaming_different_file_types(self):
    dir = self._new_tempdir()
    input = iter(WriteFilesTest.SIMPLE_COLLECTION)
    ts = (
        TestStream().advance_watermark_to(0).add_elements(
            [next(input), next(input)]).advance_watermark_to(10).add_elements(
                [next(input),
                 next(input)]).advance_watermark_to(20).add_elements([
                     next(input), next(input)
                 ]).advance_watermark_to(30).add_elements([
                     next(input), next(input)
                 ]).advance_watermark_to(40).advance_watermark_to_infinity())

    def no_colon_file_naming(*args):
      file_name = fileio.destination_prefix_naming()(*args)
      return file_name.replace(':', '_')

    with TestPipeline() as p:
      _ = (
          p
          | ts
          | beam.WindowInto(FixedWindows(10))
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=no_colon_file_naming,
              max_writers_per_bundle=0,
          ))

    with TestPipeline() as p:
      cncf_files = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | "CncfFileNames" >> beam.Map(lambda fm: fm.path))

      apache_files = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

      assert_that(
          cncf_files,
          matches_all([
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_00-1970-01-01T00_00_10.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_10-1970-01-01T00_00_20.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_20-1970-01-01T00_00_30.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_30-1970-01-01T00_00_40.*')
          ]),
          label='verifyCNCFFiles')

      assert_that(
          apache_files,
          matches_all([
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_00-1970-01-01T00_00_10.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_10-1970-01-01T00_00_20.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_20-1970-01-01T00_00_30.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_30-1970-01-01T00_00_40.*')
          ]),
          label='verifyApacheFiles')

  def test_shard_naming(self):
    namer = fileio.default_file_naming(prefix='/path/to/file', suffix='.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, None, None, None, None),
        '/path/to/file.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, 1, 5, None, None),
        '/path/to/file-00001-of-00005.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, 1, 5, 'gz', None),
        '/path/to/file-00001-of-00005.txt.gz')
    self.assertEqual(
        namer(IntervalWindow(0, 100), None, 1, 5, None, None),
        '/path/to/file'
        '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()