github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/fileio.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""``PTransforms`` for manipulating files in Apache Beam.

Provides reading ``PTransform``\\s, ``MatchFiles`` and ``MatchAll``, that
produce a ``PCollection`` of records representing a file and its metadata;
and ``ReadMatches``, which takes in a ``PCollection`` of file metadata
records, and produces a ``PCollection`` of ``ReadableFile`` objects.
These transforms currently do not support splitting by themselves.

Writing to Files
================

The transforms in this file include ``WriteToFiles``, which allows you to write
a ``beam.PCollection`` to files, and gives you many options to customize how to
do this.

The ``WriteToFiles`` transform supports bounded and unbounded PCollections
(i.e. it can be used in both batch and streaming pipelines). For streaming
pipelines, it currently does not have support for multiple trigger firings
on the same window.

File Naming
-----------
One of the parameters received by ``WriteToFiles`` is a function specifying how
to name the files that are written. This is a function that takes in the
following parameters:

- window
- pane
- shard_index
- total_shards
- compression
- destination

It should return a file name that is unique for a combination of these
parameters.

The default naming strategy is to name files in the format
`$prefix-$start-$end-$pane-$shard-of-$numShards$suffix$compressionSuffix`,
where:

- `$prefix` is, by default, `"output"`.
- `$start` and `$end` are the boundaries of the window for the data being
  written. These are omitted if we're using the Global window.
- `$pane` is the index of the firing for this window.
- `$shard` and `$numShards` are the current shard number, and the total number
  of shards for this window firing.
- `$suffix` is, by default, an empty string, but it can be set by the user via
  ``default_file_naming``.

Dynamic Destinations
--------------------
If the elements in the input ``beam.PCollection`` can be partitioned into groups
that should be treated differently (e.g. some events are to be stored as CSV,
while some others are to be stored as Avro files), it is possible to do this
by passing a `destination` parameter to ``WriteToFiles``.
Something like the following::

  my_pcollection | beam.io.fileio.WriteToFiles(
      path='/my/file/path',
      destination=lambda record: 'avro' if record['type'] == 'A' else 'csv',
      sink=lambda dest: AvroSink() if dest == 'avro' else CsvSink(),
      file_naming=beam.io.fileio.destination_prefix_naming())

In this transform, depending on the type of a record, it will be written to
a destination named `'avro'` or `'csv'`. The value returned by the
`destination` call is then passed to the `sink` call, to determine what sort of
sink will be used for each destination. The return type of the `destination`
parameter can be anything, as long as elements can be grouped by it.
"""

# pytype: skip-file

import collections
import logging
import os
import random
import uuid
from collections import namedtuple
from functools import partial
from typing import TYPE_CHECKING
from typing import Any
from typing import BinaryIO  # pylint: disable=unused-import
from typing import Callable
from typing import DefaultDict
from typing import Dict
from typing import Iterable
from typing import List
from typing import Tuple
from typing import Union

import apache_beam as beam
from apache_beam.io import filesystem
from apache_beam.io import filesystems
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.options.value_provider import ValueProvider
from apache_beam.transforms.periodicsequence import PeriodicImpulse
from apache_beam.transforms.userstate import CombiningValueStateSpec
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import IntervalWindow
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import Timestamp

if TYPE_CHECKING:
  from apache_beam.transforms.window import BoundedWindow

__all__ = [
    'EmptyMatchTreatment',
    'MatchFiles',
    'MatchAll',
    'MatchContinuously',
    'ReadableFile',
    'ReadMatches',
    'WriteToFiles'
]

_LOGGER = logging.getLogger(__name__)

FileMetadata = namedtuple("FileMetadata", "mime_type compression_type")

CreateFileMetadataFn = Callable[[str, str], FileMetadata]


class EmptyMatchTreatment(object):
  """How to treat empty matches in ``MatchAll`` and ``MatchFiles`` transforms.

  If empty matches are disallowed, an error will be thrown if a pattern does
  not match any files."""

  ALLOW = 'ALLOW'
  DISALLOW = 'DISALLOW'
  ALLOW_IF_WILDCARD = 'ALLOW_IF_WILDCARD'

  @staticmethod
  def allow_empty_match(pattern, setting):
    if setting == EmptyMatchTreatment.ALLOW:
      return True
    elif setting == EmptyMatchTreatment.ALLOW_IF_WILDCARD and '*' in pattern:
      return True
    elif setting == EmptyMatchTreatment.DISALLOW:
      return False
    else:
      raise ValueError(setting)


class _MatchAllFn(beam.DoFn):
  def __init__(self, empty_match_treatment):
    self._empty_match_treatment = empty_match_treatment

  def process(self, file_pattern: str) -> List[filesystem.FileMetadata]:
    # TODO: Should we batch the lookups?
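    # FileSystems.match takes a list of patterns and returns one MatchResult
    # per pattern; a single pattern is passed here, so only the first result
    # is inspected.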
    match_results = filesystems.FileSystems.match([file_pattern])
    match_result = match_results[0]

    if (not match_result.metadata_list and
        not EmptyMatchTreatment.allow_empty_match(file_pattern,
                                                  self._empty_match_treatment)):
      raise BeamIOError(
          'Empty match for pattern %s. Disallowed.' % file_pattern)

    return match_result.metadata_list


class MatchFiles(beam.PTransform):
  """Matches a file pattern using ``FileSystems.match``.

  This ``PTransform`` returns a ``PCollection`` of matching files in the form
  of ``FileMetadata`` objects."""
  def __init__(
      self,
      file_pattern: str,
      empty_match_treatment=EmptyMatchTreatment.ALLOW_IF_WILDCARD):
    self._file_pattern = file_pattern
    self._empty_match_treatment = empty_match_treatment

  def expand(self, pcoll) -> beam.PCollection[filesystem.FileMetadata]:
    return pcoll.pipeline | beam.Create([self._file_pattern]) | MatchAll()


class MatchAll(beam.PTransform):
  """Matches file patterns from the input PCollection via ``FileSystems.match``.

  This ``PTransform`` returns a ``PCollection`` of matching files in the form
  of ``FileMetadata`` objects."""
  def __init__(self, empty_match_treatment=EmptyMatchTreatment.ALLOW):
    self._empty_match_treatment = empty_match_treatment

  def expand(
      self,
      pcoll: beam.PCollection,
  ) -> beam.PCollection[filesystem.FileMetadata]:
    return pcoll | beam.ParDo(_MatchAllFn(self._empty_match_treatment))


class ReadableFile(object):
  """A utility class for accessing files."""
  def __init__(self, metadata, compression=None):
    self.metadata = metadata
    self._compression = compression

  def open(self, mime_type='text/plain', compression_type=None):
    compression = (
        compression_type or self._compression or
        filesystems.CompressionTypes.AUTO)
    return filesystems.FileSystems.open(
        self.metadata.path, mime_type=mime_type, compression_type=compression)

  def read(self, mime_type='application/octet-stream'):
    return self.open(mime_type).read()

  def read_utf8(self):
    return self.open().read().decode('utf-8')


class _ReadMatchesFn(beam.DoFn):
  def __init__(self, compression, skip_directories):
    self._compression = compression
    self._skip_directories = skip_directories

  def process(
      self,
      file_metadata: Union[str, filesystem.FileMetadata],
  ) -> Iterable[ReadableFile]:
    metadata = (
        filesystem.FileMetadata(file_metadata, 0) if isinstance(
            file_metadata, str) else file_metadata)

    if ((metadata.path.endswith('/') or metadata.path.endswith('\\')) and
        self._skip_directories):
      return
    elif metadata.path.endswith('/') or metadata.path.endswith('\\'):
      raise BeamIOError(
          'Directories are not allowed in ReadMatches transform. '
          'Found %s.' % metadata.path)

    # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
    yield ReadableFile(metadata, self._compression)


class MatchContinuously(beam.PTransform):
  """Checks for new files for a given pattern every interval.

  This ``PTransform`` returns a ``PCollection`` of matching files in the form
  of ``FileMetadata`` objects.

  MatchContinuously is experimental. No backwards-compatibility
  guarantees.
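
  A minimal usage sketch (the file pattern, interval, and downstream
  ``ReadMatches`` step are illustrative; ``p`` is assumed to be a
  ``beam.Pipeline`` running on a runner that supports unbounded sources)::

    files = (
        p
        | MatchContinuously('gs://my-bucket/incoming/*.csv', interval=60)
        | ReadMatches())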
269 """ 270 def __init__( 271 self, 272 file_pattern, 273 interval=360.0, 274 has_deduplication=True, 275 start_timestamp=Timestamp.now(), 276 stop_timestamp=MAX_TIMESTAMP, 277 match_updated_files=False, 278 apply_windowing=False, 279 empty_match_treatment=EmptyMatchTreatment.ALLOW): 280 """Initializes a MatchContinuously transform. 281 282 Args: 283 file_pattern: The file path to read from. 284 interval: Interval at which to check for files in seconds. 285 has_deduplication: Whether files already read are discarded or not. 286 start_timestamp: Timestamp for start file checking. 287 stop_timestamp: Timestamp after which no more files will be checked. 288 match_updated_files: (When has_deduplication is set to True) whether match 289 file with timestamp changes. 290 apply_windowing: Whether each element should be assigned to 291 individual window. If false, all elements will reside in global window. 292 """ 293 294 self.file_pattern = file_pattern 295 self.interval = interval 296 self.has_deduplication = has_deduplication 297 self.start_ts = start_timestamp 298 self.stop_ts = stop_timestamp 299 self.match_upd = match_updated_files 300 self.apply_windowing = apply_windowing 301 self.empty_match_treatment = empty_match_treatment 302 303 def expand(self, pbegin) -> beam.PCollection[filesystem.FileMetadata]: 304 # invoke periodic impulse 305 impulse = pbegin | PeriodicImpulse( 306 start_timestamp=self.start_ts, 307 stop_timestamp=self.stop_ts, 308 fire_interval=self.interval) 309 310 # match file pattern periodically 311 match_files = ( 312 impulse 313 | 'GetFilePattern' >> beam.Map(lambda x: self.file_pattern) 314 | MatchAll(self.empty_match_treatment)) 315 316 # apply deduplication strategy if required 317 if self.has_deduplication: 318 # Making a Key Value so each file has its own state. 319 match_files = match_files | 'ToKV' >> beam.Map(lambda x: (x.path, x)) 320 if self.match_upd: 321 match_files = match_files | 'RemoveOldAlreadyRead' >> beam.ParDo( 322 _RemoveOldDuplicates()) 323 else: 324 match_files = match_files | 'RemoveAlreadyRead' >> beam.ParDo( 325 _RemoveDuplicates()) 326 327 # apply windowing if required. Apply at last because deduplication relies on 328 # the global window. 329 if self.apply_windowing: 330 match_files = match_files | beam.WindowInto(FixedWindows(self.interval)) 331 332 return match_files 333 334 335 class ReadMatches(beam.PTransform): 336 """Converts each result of MatchFiles() or MatchAll() to a ReadableFile. 337 338 This helps read in a file's contents or obtain a file descriptor.""" 339 def __init__(self, compression=None, skip_directories=True): 340 self._compression = compression 341 self._skip_directories = skip_directories 342 343 def expand( 344 self, 345 pcoll: beam.PCollection[Union[str, filesystem.FileMetadata]], 346 ) -> beam.PCollection[ReadableFile]: 347 return pcoll | beam.ParDo( 348 _ReadMatchesFn(self._compression, self._skip_directories)) 349 350 351 class FileSink(object): 352 """Specifies how to write elements to individual files in ``WriteToFiles``. 353 354 A Sink class must implement the following: 355 356 - The ``open`` method, which initializes writing to a file handler (it is not 357 responsible for opening the file handler itself). 358 - The ``write`` method, which writes an element to the file that was passed 359 in ``open``. 360 - The ``flush`` method, which flushes any buffered state. This is most often 361 called before closing a file (but not exclusively called in that 362 situation). 

  A Sink class can override the following:

  - The ``create_metadata`` method, which creates all metadata passed to
    Filesystems.create.
  """
  def create_metadata(
      self, destination: str, full_file_name: str) -> FileMetadata:
    return FileMetadata(
        mime_type="application/octet-stream",
        compression_type=CompressionTypes.AUTO)

  def open(self, fh):
    # type: (BinaryIO) -> None
    raise NotImplementedError

  def write(self, record):
    raise NotImplementedError

  def flush(self):
    raise NotImplementedError


@beam.typehints.with_input_types(str)
class TextSink(FileSink):
  """A sink that encodes utf8 elements, and writes to file handlers.

  This sink simply calls file_handler.write(record.encode('utf8') + '\n') on
  all records that come into it.
  """
  def open(self, fh):
    self._fh = fh

  def write(self, record):
    self._fh.write(record.encode('utf8'))
    self._fh.write(b'\n')

  def flush(self):
    self._fh.flush()


def prefix_naming(prefix):
  return default_file_naming(prefix)


_DEFAULT_FILE_NAME_TEMPLATE = (
    '{prefix}-{start}-{end}-{pane}-'
    '{shard:05d}-of-{total_shards:05d}'
    '{suffix}{compression}')


def _format_shard(
    window, pane, shard_index, total_shards, compression, prefix, suffix):
  kwargs = {
      'prefix': prefix,
      'start': '',
      'end': '',
      'pane': '',
      'shard': 0,
      'total_shards': 0,
      'suffix': '',
      'compression': ''
  }

  if total_shards is not None and shard_index is not None:
    kwargs['shard'] = int(shard_index)
    kwargs['total_shards'] = int(total_shards)

  if window != GlobalWindow():
    kwargs['start'] = window.start.to_utc_datetime().isoformat()
    kwargs['end'] = window.end.to_utc_datetime().isoformat()

  # TODO(https://github.com/apache/beam/issues/18721): Add support for
  # PaneInfo. If the PANE is the ONLY firing in the window, we don't add it.
  # if pane and not (pane.is_first and pane.is_last):
  #   kwargs['pane'] = pane.index

  if suffix:
    kwargs['suffix'] = suffix

  if compression:
    kwargs['compression'] = '.%s' % compression

  # Remove separators for unused template parts.
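  # For example, in the GlobalWindow with the default prefix, no suffix and no
  # compression, this yields names like 'output-00000-of-00005'.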
  format = _DEFAULT_FILE_NAME_TEMPLATE
  if shard_index is None:
    format = format.replace('-{shard:05d}', '')
  if total_shards is None:
    format = format.replace('-of-{total_shards:05d}', '')
  for name, value in kwargs.items():
    if value in (None, ''):
      format = format.replace('-{%s}' % name, '')

  return format.format(**kwargs)


FileNaming = Callable[[Any, Any, int, int, Any, str, str], str]


def destination_prefix_naming(suffix=None) -> FileNaming:
  def _inner(window, pane, shard_index, total_shards, compression, destination):
    prefix = str(destination)
    return _format_shard(
        window, pane, shard_index, total_shards, compression, prefix, suffix)

  return _inner


def default_file_naming(prefix, suffix=None) -> FileNaming:
  def _inner(window, pane, shard_index, total_shards, compression, destination):
    return _format_shard(
        window, pane, shard_index, total_shards, compression, prefix, suffix)

  return _inner


def single_file_naming(prefix, suffix=None) -> FileNaming:
  def _inner(window, pane, shard_index, total_shards, compression, destination):
    assert shard_index in (0, None), shard_index
    assert total_shards in (1, None), total_shards
    return _format_shard(window, pane, None, None, compression, prefix, suffix)

  return _inner


_FileResult = collections.namedtuple(
    'FileResult', [
        'file_name',
        'shard_index',
        'total_shards',
        'window',
        'pane',
        'destination'
    ])


# Adding a class to contain PyDoc.
class FileResult(_FileResult):
  """A descriptor of a file that has been written."""
  pass


class WriteToFiles(beam.PTransform):
  r"""Write the incoming PCollection to a set of output files.

  The incoming ``PCollection`` may be bounded or unbounded.

  **Note:** For unbounded ``PCollection``\s, this transform does not support
  multiple firings per Window (due to the fact that files are named only by
  their destination, and window, at the moment).

  WriteToFiles is experimental. No backwards-compatibility guarantees.
  """

  # We allow up to 20 different destinations to be written in a single bundle.
  # Too many files will add memory pressure to the worker, so we let it be 20.
  MAX_NUM_WRITERS_PER_BUNDLE = 20

  DEFAULT_SHARDING = 5

  def __init__(
      self,
      path,
      file_naming=None,
      destination=None,
      temp_directory=None,
      sink=None,
      shards=None,
      output_fn=None,
      max_writers_per_bundle=MAX_NUM_WRITERS_PER_BUNDLE):
    """Initializes a WriteToFiles transform.

    Args:
      path (str, ValueProvider): The directory to write files into.
      file_naming (callable): A callable that takes in a window, pane,
        shard_index, total_shards, compression and destination; and returns a
        file name.
      destination (callable): A function that maps each record to a
        destination. If this argument is provided, the sink parameter must
        also be a callable.
      temp_directory (str, ValueProvider): To ensure atomicity in the
        transform, the output is written into temporary files, which are
        written to a directory that is meant to be temporary as well. Once the
        whole output has been written, the files are moved into their final
        destination, and given their final names. By default, the temporary
        directory will be within the temp_location of your pipeline.
      sink (callable, ~apache_beam.io.fileio.FileSink): The sink to use to
        write into a file.
        It should implement the methods of a ``FileSink``. Pass a FileSink
        subclass or an instance of FileSink to this parameter. If none is
        provided, a ``TextSink`` is used.
      shards (int): The number of shards per destination and trigger firing.
      max_writers_per_bundle (int): The number of writers that can be open
        concurrently in a single worker that's processing one bundle.
    """
    self.path = (
        path if isinstance(path, ValueProvider) else StaticValueProvider(
            str, path))
    self.file_naming_fn = file_naming or default_file_naming('output')
    self.destination_fn = self._get_destination_fn(destination)
    self._temp_directory = temp_directory
    self.sink_fn = self._get_sink_fn(sink)
    self.shards = shards or WriteToFiles.DEFAULT_SHARDING
    self.output_fn = output_fn or (lambda x: x)

    self._max_num_writers_per_bundle = max_writers_per_bundle

  @staticmethod
  def _get_sink_fn(input_sink):
    # type: (...) -> Callable[[Any], FileSink]
    if isinstance(input_sink, type) and issubclass(input_sink, FileSink):
      return lambda x: input_sink()
    elif isinstance(input_sink, FileSink):
      kls = input_sink.__class__
      return lambda x: kls()
    elif callable(input_sink):
      return input_sink
    else:
      return lambda x: TextSink()

  @staticmethod
  def _get_destination_fn(destination):
    # type: (...) -> Callable[[Any], str]
    if isinstance(destination, ValueProvider):
      return lambda elm: destination.get()
    elif callable(destination):
      return destination
    else:
      return lambda elm: destination

  def expand(self, pcoll):
    p = pcoll.pipeline

    if not self._temp_directory:
      temp_location = (
          p.options.view_as(GoogleCloudOptions).temp_location or
          self.path.get())
      dir_uid = str(uuid.uuid4())
      self._temp_directory = StaticValueProvider(
          str,
          filesystems.FileSystems.join(temp_location, '.temp%s' % dir_uid))
      _LOGGER.info('Added temporary directory %s', self._temp_directory.get())

    output = (
        pcoll
        | beam.ParDo(
            _WriteUnshardedRecordsFn(
                base_path=self._temp_directory,
                destination_fn=self.destination_fn,
                sink_fn=self.sink_fn,
                max_writers_per_bundle=self._max_num_writers_per_bundle)).
        with_outputs(
            _WriteUnshardedRecordsFn.SPILLED_RECORDS,
            _WriteUnshardedRecordsFn.WRITTEN_FILES))

    written_files_pc = output[_WriteUnshardedRecordsFn.WRITTEN_FILES]
    spilled_records_pc = output[_WriteUnshardedRecordsFn.SPILLED_RECORDS]

    more_written_files_pc = (
        spilled_records_pc
        | beam.ParDo(
            _AppendShardedDestination(self.destination_fn, self.shards))
        | "GroupRecordsByDestinationAndShard" >> beam.GroupByKey()
        | beam.ParDo(
            _WriteShardedRecordsFn(
                self._temp_directory, self.sink_fn, self.shards)))

    files_by_destination_pc = (
        (written_files_pc, more_written_files_pc)
        | beam.Flatten()
        | beam.Map(lambda file_result: (file_result.destination, file_result))
        | "GroupTempFilesByDestination" >> beam.GroupByKey())

    # Now we should take the temporary files, and write them to the final
    # destination, with their proper names.
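    # _MoveTempFilesIntoFinalDestinationFn renames each temporary file using
    # file_naming_fn and emits a FileResult describing every final file.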

    file_results = (
        files_by_destination_pc
        | beam.ParDo(
            _MoveTempFilesIntoFinalDestinationFn(
                self.path, self.file_naming_fn, self._temp_directory)))

    return file_results


def _create_writer(
    base_path,
    writer_key: Tuple[str, IntervalWindow],
    create_metadata_fn: CreateFileMetadataFn,
):
  try:
    filesystems.FileSystems.mkdirs(base_path)
  except IOError:
    # Directory already exists.
    pass

  destination = writer_key[0]

  # The file name has a prefix determined by destination+window, along with
  # a random string. This allows us to retrieve orphaned files later on.
  file_name = '%s_%s' % (abs(hash(writer_key)), uuid.uuid4())
  full_file_name = filesystems.FileSystems.join(base_path, file_name)
  metadata = create_metadata_fn(destination, full_file_name)
  return full_file_name, filesystems.FileSystems.create(
      full_file_name,
      **metadata._asdict())


class _MoveTempFilesIntoFinalDestinationFn(beam.DoFn):
  def __init__(self, path, file_naming_fn, temp_dir):
    self.path = path
    self.file_naming_fn = file_naming_fn
    self.temporary_directory = temp_dir

  def process(self, element, w=beam.DoFn.WindowParam):
    destination = element[0]
    # list of FileResult objects for temp files
    temp_file_results = list(element[1])
    # list of FileResult objects for final files
    final_file_results = []

    for i, r in enumerate(temp_file_results):
      # TODO(pabloem): Handle compression for files.
      final_file_name = self.file_naming_fn(
          r.window, r.pane, i, len(temp_file_results), '', destination)

      final_file_results.append(
          FileResult(
              final_file_name,
              i,
              len(temp_file_results),
              r.window,
              r.pane,
              destination))

    move_from = [f.file_name for f in temp_file_results]
    move_to = [f.file_name for f in final_file_results]
    _LOGGER.info(
        'Moving temporary files %s to dir: %s as %s',
        [os.path.basename(f) for f in move_from],
        self.path.get(),
        move_to)

    try:
      filesystems.FileSystems.rename(
          move_from,
          [filesystems.FileSystems.join(self.path.get(), f) for f in move_to])
    except BeamIOError:
      # This error is not serious, because it may happen on a retry of the
      # bundle. We simply log it.
      _LOGGER.debug(
          'Exception occurred during moving files: %s. This may be due to a'
          ' bundle being retried.',
          move_from)
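
    # The FileResult objects for the final files are this transform's output;
    # downstream stages can use them to locate the files that were written.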
    yield from final_file_results

    _LOGGER.debug(
        'Checking orphaned temporary files for destination %s and window %s',
        destination,
        w)
    writer_key = (destination, w)
    self._check_orphaned_files(writer_key)

  def _check_orphaned_files(self, writer_key):
    try:
      prefix = filesystems.FileSystems.join(
          self.temporary_directory.get(), str(abs(hash(writer_key))))
      match_result = filesystems.FileSystems.match(['%s*' % prefix])
      orphaned_files = [m.path for m in match_result[0].metadata_list]

      if len(orphaned_files) > 0:
        _LOGGER.info(
            'Some files may be left orphaned in the temporary folder: %s',
            orphaned_files)
    except BeamIOError as e:
      _LOGGER.info('Exceptions when checking orphaned files: %s', e)


class _WriteShardedRecordsFn(beam.DoFn):

  def __init__(self,
               base_path,
               sink_fn,  # type: Callable[[Any], FileSink]
               shards  # type: int
              ):
    self.base_path = base_path
    self.sink_fn = sink_fn
    self.shards = shards

  def process(
      self, element, w=beam.DoFn.WindowParam, pane=beam.DoFn.PaneInfoParam):
    destination_and_shard = element[0]
    destination = destination_and_shard[0]
    shard = destination_and_shard[1]
    records = element[1]

    sink = self.sink_fn(destination)

    full_file_name, writer = _create_writer(
        base_path=self.base_path.get(),
        writer_key=(destination, w),
        create_metadata_fn=sink.create_metadata)

    sink.open(writer)

    for r in records:
      sink.write(r)

    sink.flush()
    writer.close()

    _LOGGER.info(
        'Writing file %s for destination %s and shard %s',
        full_file_name,
        destination,
        repr(shard))

    yield FileResult(
        full_file_name,
        shard_index=shard,
        total_shards=self.shards,
        window=w,
        pane=pane,
        destination=destination)


class _AppendShardedDestination(beam.DoFn):
  def __init__(
      self,
      destination,  # type: Callable[[Any], str]
      shards  # type: int
  ):
    self.destination_fn = destination
    self.shards = shards

    # We start the shards for a single destination at an arbitrary point.
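    # Each destination then advances through its shards round-robin (see
    # _next_shard_for_destination below), so records are spread evenly.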
    self._shard_counter = collections.defaultdict(
        lambda: random.randrange(self.shards))  # type: DefaultDict[str, int]

  def _next_shard_for_destination(self, destination):
    self._shard_counter[destination] = ((self._shard_counter[destination] + 1) %
                                        self.shards)

    return self._shard_counter[destination]

  def process(self, record):
    destination = self.destination_fn(record)
    shard = self._next_shard_for_destination(destination)

    yield ((destination, shard), record)


class _WriteUnshardedRecordsFn(beam.DoFn):

  SPILLED_RECORDS = 'spilled_records'
  WRITTEN_FILES = 'written_files'

  _writers_and_sinks = None  # type: Dict[Tuple[str, BoundedWindow], Tuple[BinaryIO, FileSink]]
  _file_names = None  # type: Dict[Tuple[str, BoundedWindow], str]

  def __init__(
      self,
      base_path,
      destination_fn,
      sink_fn,
      max_writers_per_bundle=WriteToFiles.MAX_NUM_WRITERS_PER_BUNDLE):
    self.base_path = base_path
    self.destination_fn = destination_fn
    self.sink_fn = sink_fn
    self.max_num_writers_per_bundle = max_writers_per_bundle

  def start_bundle(self):
    self._writers_and_sinks = {}
    self._file_names = {}

  def process(
      self, record, w=beam.DoFn.WindowParam, pane=beam.DoFn.PaneInfoParam):
    destination = self.destination_fn(record)

    writer, sink = self._get_or_create_writer_and_sink(destination, w)

    if not writer:
      return [beam.pvalue.TaggedOutput(self.SPILLED_RECORDS, record)]
    else:
      sink.write(record)

  def _get_or_create_writer_and_sink(self, destination, window):
    """Returns a tuple of writer, sink."""
    writer_key = (destination, window)
    if writer_key in self._writers_and_sinks:
      return self._writers_and_sinks.get(writer_key)
    elif len(self._writers_and_sinks) >= self.max_num_writers_per_bundle:
      # The writer does not exist, and we have too many writers already.
      return None, None
    else:
      # The writer does not exist, but we can still create a new one.
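      # The new writer and sink are cached for the rest of the bundle and are
      # flushed and closed in finish_bundle.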
      sink = self.sink_fn(destination)

      full_file_name, writer = _create_writer(
          base_path=self.base_path.get(),
          writer_key=writer_key,
          create_metadata_fn=sink.create_metadata)

      sink.open(writer)
      self._writers_and_sinks[writer_key] = (writer, sink)
      self._file_names[writer_key] = full_file_name
      return self._writers_and_sinks[writer_key]

  def finish_bundle(self):
    for key, (writer, sink) in self._writers_and_sinks.items():

      sink.flush()
      writer.close()

      file_result = FileResult(self._file_names[key],
                               shard_index=-1,
                               total_shards=0,
                               window=key[1],
                               pane=None,  # TODO(pabloem): get the pane info
                               destination=key[0])

      yield beam.pvalue.TaggedOutput(
          self.WRITTEN_FILES,
          beam.transforms.window.WindowedValue(
              file_result,
              timestamp=key[1].start,
              windows=[key[1]]  # TODO(pabloem) HOW DO WE GET THE PANE
          ))


class _RemoveDuplicates(beam.DoFn):
  """Internal DoFn that filters out file names that have already been seen,
  even if the file has been updated."""
  COUNT_STATE = CombiningValueStateSpec('count', combine_fn=sum)

  def process(
      self,
      element: Tuple[str, filesystem.FileMetadata],
      count_state=beam.DoFn.StateParam(COUNT_STATE)
  ) -> Iterable[filesystem.FileMetadata]:

    path = element[0]
    file_metadata = element[1]
    counter = count_state.read()

    if counter == 0:
      count_state.add(1)
      _LOGGER.debug('Generated entry for file %s', path)
      yield file_metadata
    else:
      _LOGGER.debug('File %s was already read, seen %d times', path, counter)


class _RemoveOldDuplicates(beam.DoFn):
  """Internal DoFn that filters out file names that have already been seen
  and whose timestamp is unchanged."""
  TIME_STATE = CombiningValueStateSpec(
      'count', combine_fn=partial(max, default=0.0))

  def process(
      self,
      element: Tuple[str, filesystem.FileMetadata],
      time_state=beam.DoFn.StateParam(TIME_STATE)
  ) -> Iterable[filesystem.FileMetadata]:
    path = element[0]
    file_metadata = element[1]
    new_ts = file_metadata.last_updated_in_seconds
    old_ts = time_state.read()

    if old_ts < new_ts:
      time_state.add(new_ts)
      _LOGGER.debug('Generated entry for file %s', path)
      yield file_metadata
    else:
      _LOGGER.debug('File %s was already read', path)