#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for transforms defined in apache_beam.io.fileio."""

# pytype: skip-file

import csv
import io
import json
import logging
import os
import unittest
import uuid
import warnings

import pytest
from hamcrest.library.text import stringmatches

import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.io.filebasedsink_test import _TestCaseWithTempDirCleanUp
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.test_utils import compute_hash
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.testing.util import matches_all
from apache_beam.transforms import trigger
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import IntervalWindow
from apache_beam.utils.timestamp import Timestamp

warnings.filterwarnings(
    'ignore', category=FutureWarning, module='apache_beam.io.fileio_test')


def _get_file_reader(readable_file):
  return io.TextIOWrapper(readable_file.open())
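# ReadableFile.open() hands back a binary, file-like object; the CSV tests
# below wrap it in io.TextIOWrapper so csv.reader receives text lines. This
# assumes the fixture files use the platform default / UTF-8 encoding, which
# holds for the ASCII files these tests create themselves.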


class MatchTest(_TestCaseWithTempDirCleanUp):
  def test_basic_two_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir))
    files.append(self._create_temp_file(dir=tempdir))

    with TestPipeline() as p:
      files_pc = (
          p
          | fileio.MatchFiles(FileSystems.join(tempdir, '*'))
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))

  def test_match_all_two_directories(self):
    files = []
    directories = []

    for _ in range(2):
      # TODO: What about this having to append the ending slash?
      d = '%s%s' % (self._new_tempdir(), os.sep)
      directories.append(d)

      files.append(self._create_temp_file(dir=d))
      files.append(self._create_temp_file(dir=d))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll()
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))

  def test_match_files_one_directory_failure1(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)
    ]

    files = []
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with self.assertRaises(beam.io.filesystem.BeamIOError):
      with TestPipeline() as p:
        files_pc = (
            p
            | beam.Create([FileSystems.join(d, '*') for d in directories])
            | fileio.MatchAll(fileio.EmptyMatchTreatment.DISALLOW)
            | beam.Map(lambda x: x.path))

        assert_that(files_pc, equal_to(files))

  def test_match_files_one_directory_failure2(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)
    ]

    files = []
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))


class ReadTest(_TestCaseWithTempDirCleanUp):
  def test_basic_file_name_provided(self):
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

      assert_that(content_pc, equal_to(content.splitlines()))

  def test_csv_file_source(self):
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    rows = [r.split(',') for r in content.split('\n')]

    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(content_pc, equal_to(rows))

  def test_infer_compressed_file(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import gzip
    with gzip.GzipFile(os.path.join(dir, 'compressed.gz'), 'w') as f:
      f.write(file_contents)

    file_contents2 = b'compressed_contents_bz2!'
    import bz2
    with bz2.BZ2File(os.path.join(dir, 'compressed2.bz2'), 'w') as f:
      f.write(file_contents2)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(lambda rf: rf.open().readline()))

      assert_that(content_pc, equal_to([file_contents, file_contents2]))

  def test_read_bz2_compressed_file_without_suffix(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import bz2
    with bz2.BZ2File(os.path.join(dir, 'compressed'), 'w') as f:
      f.write(file_contents)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(
              lambda rf: rf.open(compression_type=CompressionTypes.BZIP2).read(
                  len(file_contents))))

      assert_that(content_pc, equal_to([file_contents]))

  def test_read_gzip_compressed_file_without_suffix(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import gzip
    with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
      f.write(file_contents)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(
              lambda rf: rf.open(compression_type=CompressionTypes.GZIP).read(
                  len(file_contents))))

      assert_that(content_pc, equal_to([file_contents]))

  def test_string_filenames_and_skip_directory(self):
    content = 'thecontent\n'
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir, content=content))
    files.append(self._create_temp_file(dir=tempdir, content=content))

    with TestPipeline() as p:
      contents_pc = (
          p
          | beam.Create(files + ['%s/' % tempdir])
          | fileio.ReadMatches()
          | beam.FlatMap(lambda x: x.read().decode('utf-8').splitlines()))

      assert_that(contents_pc, equal_to(content.splitlines() * 2))

  def test_fail_on_directories(self):
    content = 'thecontent\n'
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir, content=content))
    files.append(self._create_temp_file(dir=tempdir, content=content))

    with self.assertRaises(beam.io.filesystem.BeamIOError):
      with TestPipeline() as p:
        _ = (
            p
            | beam.Create(files + ['%s/' % tempdir])
            | fileio.ReadMatches(skip_directories=False)
            | beam.Map(lambda x: x.read_utf8()))
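# Note on the compression tests above: ReadableFile.open() defaults to
# CompressionTypes.AUTO, so compression is inferred from the file extension
# ('.gz', '.bz2', ...). When a file has no recognizable suffix, callers pass
# an explicit compression_type, as the two *_without_suffix tests do.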


class MatchIntegrationTest(unittest.TestCase):

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  KINGLEAR_CHECKSUM = 'f418b25f1507f5a901257026b035ac2857a7ab87'
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-00000000000*.json')

  WIKI_FILES = [
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000001.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000002.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000003.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000004.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000005.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000006.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000007.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000008.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000009.json',
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

  @pytest.mark.it_postcommit
  def test_transform_on_gcs(self):
    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      matches_pc = (
          p
          | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
          | fileio.MatchAll()
          | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

      assert_that(
          matches_pc,
          equal_to([self.INPUT_FILE] + self.WIKI_FILES),
          label='Matched Files')

      checksum_pc = (
          p
          | 'SingleFile' >> beam.Create([self.INPUT_FILE])
          | 'MatchOneAll' >> fileio.MatchAll()
          | fileio.ReadMatches()
          | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
          | 'Checksums' >> beam.Map(compute_hash))

      assert_that(
          checksum_pc,
          equal_to([self.KINGLEAR_CHECKSUM]),
          label='Assert Checksums')
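# MatchContinuously re-evaluates the file pattern every `interval` seconds
# between start_timestamp and stop_timestamp. With the default
# has_deduplication=True a path is emitted only once even if it matches in
# several polls; with has_deduplication=False it is emitted on every poll,
# which is why test_without_deduplication expects the pre-existing file twice.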


class MatchContinuouslyTest(_TestCaseWithTempDirCleanUp):
  def test_with_deduplication(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a file to be matched before the pipeline starts
    files.append(self._create_temp_file(dir=tempdir))
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              start_timestamp=start,
              stop_timestamp=stop)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))

  def test_without_deduplication(self):
    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a file to be matched before the pipeline starts
    file = self._create_temp_file(dir=tempdir)
    # Add file twice, since it will be matched for every interval
    files += [file, file]
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              has_deduplication=False,
              start_timestamp=start,
              stop_timestamp=stop)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))

  def test_match_updated_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    # Create two files to be matched before the pipeline starts
    files.append(self._create_temp_file(dir=tempdir))
    writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
    writer.close()

    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))
    files.append(FileSystems.join(tempdir, 'extra'))

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              start_timestamp=start,
              stop_timestamp=stop,
              match_updated_files=True)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))


class WriteFilesTest(_TestCaseWithTempDirCleanUp):

  SIMPLE_COLLECTION = [
      {
          'project': 'beam', 'foundation': 'apache'
      },
      {
          'project': 'prometheus', 'foundation': 'cncf'
      },
      {
          'project': 'flink', 'foundation': 'apache'
      },
      {
          'project': 'grpc', 'foundation': 'cncf'
      },
      {
          'project': 'spark', 'foundation': 'apache'
      },
      {
          'project': 'kubernetes', 'foundation': 'cncf'
      },
      {
          'project': 'spark', 'foundation': 'apache'
      },
      {
          'project': 'knative', 'foundation': 'cncf'
      },
      {
          'project': 'linux', 'foundation': 'linux'
      },
  ]

  LARGER_COLLECTION = ['{:05d}'.format(i) for i in range(200)]

  CSV_HEADERS = ['project', 'foundation']

  SIMPLE_COLLECTION_VALIDATION_SET = {(elm['project'], elm['foundation'])
                                      for elm in SIMPLE_COLLECTION}

  class CsvSink(fileio.TextSink):
    def __init__(self, headers):
      self.headers = headers

    def write(self, record):
      self._fh.write(','.join([record[h] for h in self.headers]).encode('utf8'))
      self._fh.write('\n'.encode('utf8'))

  class JsonSink(fileio.TextSink):
    def write(self, record):
      self._fh.write(json.dumps(record).encode('utf8'))
      self._fh.write('\n'.encode('utf8'))
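  # CsvSink and JsonSink above only override write(); fileio.TextSink (a
  # FileSink subclass) handles open() and flush() and stores the open file
  # handle as self._fh, so each write() call appends one encoded line to the
  # file that WriteToFiles opened for the current destination and bundle.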

  def test_write_to_single_file_batch(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | beam.io.fileio.WriteToFiles(path=dir))

    with TestPipeline() as p:
      result = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, '*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      assert_that(result, equal_to([row for row in self.SIMPLE_COLLECTION]))

  def test_write_to_dynamic_destination(self):

    sink_params = [
        fileio.TextSink,  # pass a type signature
        fileio.TextSink()  # pass a FileSink object
    ]

    for sink in sink_params:
      dir = self._new_tempdir()

      with TestPipeline() as p:
        _ = (
            p
            | "Create" >> beam.Create(range(100))
            | beam.Map(lambda x: str(x))
            | fileio.WriteToFiles(
                path=dir,
                destination=lambda n: "odd" if int(n) % 2 else "even",
                sink=sink,
                file_naming=fileio.destination_prefix_naming("test")))

      with TestPipeline() as p:
        result = (
            p
            | fileio.MatchFiles(FileSystems.join(dir, '*'))
            | fileio.ReadMatches()
            | beam.Map(
                lambda f: (
                    os.path.basename(f.metadata.path).split('-')[0],
                    sorted(map(int, f.read_utf8().strip().split('\n'))))))

        assert_that(
            result,
            equal_to([('odd', list(range(1, 100, 2))),
                      ('even', list(range(0, 100, 2)))]))
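  # max_writers_per_bundle caps how many destination files a bundle keeps open
  # at once; records for destinations beyond that cap are "spilled" and written
  # in a later, grouped step. Setting it to 1 below forces the spilling path,
  # and the streaming tests further down use 0 so that nothing is written
  # directly in the bundle and all records take the grouped path.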

  def test_write_to_different_file_types_some_spilling(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming(),
              max_writers_per_bundle=1))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')

  @unittest.skip('https://github.com/apache/beam/issues/21269')
  def test_find_orphaned_files(self):
    dir = self._new_tempdir()

    write_transform = beam.io.fileio.WriteToFiles(path=dir)

    def write_orphaned_file(temp_dir, writer_key):
      temp_dir_path = FileSystems.join(dir, temp_dir)

      file_prefix_dir = FileSystems.join(
          temp_dir_path, str(abs(hash(writer_key))))

      file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
      with FileSystems.create(file_name) as f:
        f.write(b'Hello y\'all')

      return file_name

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | write_transform)

      # Pre-create the temp directory.
      temp_dir_path = FileSystems.mkdirs(
          FileSystems.join(dir, write_transform._temp_directory.get()))
      write_orphaned_file(
          write_transform._temp_directory.get(), (None, GlobalWindow()))
      f2 = write_orphaned_file(
          write_transform._temp_directory.get(), ('other-dest', GlobalWindow()))

    temp_dir_path = FileSystems.join(dir, write_transform._temp_directory.get())
    leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
    found_files = [m.path for m in leftovers[0].metadata_list]
    self.assertListEqual(found_files, [f2])

  def test_write_to_different_file_types(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming()))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')

  def record_dofn(self):
    class RecordDoFn(beam.DoFn):
      def process(self, element):
        WriteFilesTest.all_records.append(element)

    return RecordDoFn()

  def test_streaming_complex_timing(self):
    # Use state on the TestCase class, since other references would be pickled
    # into a closure and not have the desired side effects.
    #
    # TODO(https://github.com/apache/beam/issues/18987): Use assert_that after
    # it works for the cases here in streaming mode.
    WriteFilesTest.all_records = []

    dir = '%s%s' % (self._new_tempdir(), os.sep)

    # Setting up the input (TestStream)
    ts = TestStream().advance_watermark_to(0)
    for elm in WriteFilesTest.LARGER_COLLECTION:
      timestamp = int(elm)

      ts.add_elements([('key', '%s' % elm)])
      if timestamp % 5 == 0 and timestamp != 0:
        # TODO(https://github.com/apache/beam/issues/18721): Add many firings
        # per window after getting PaneInfo.
        ts.advance_processing_time(5)
        ts.advance_watermark_to(timestamp)
    ts.advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
      file_name = fileio.destination_prefix_naming()(*args)
      return file_name.replace(':', '_')

    # The pipeline that we are testing
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      res = (
          p
          | ts
          | beam.WindowInto(
              FixedWindows(10),
              trigger=trigger.AfterWatermark(),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | beam.GroupByKey()
          | beam.FlatMap(lambda x: x[1]))
      # Triggering after 5 processing-time seconds, and on the watermark. Also
      # discarding old elements.

      _ = (
          res
          | beam.io.fileio.WriteToFiles(
              path=dir,
              file_naming=no_colon_file_naming,
              max_writers_per_bundle=0)
          | beam.Map(lambda fr: FileSystems.join(dir, fr.file_name))
          | beam.ParDo(self.record_dofn()))

    # Verification pipeline
    with TestPipeline() as p:
      files = (p | beam.io.fileio.MatchFiles(FileSystems.join(dir, '*')))

      file_names = (files | beam.Map(lambda fm: fm.path))

      file_contents = (
          files
          | beam.io.fileio.ReadMatches()
          | beam.Map(
              lambda rf: (rf.metadata.path, rf.read_utf8().strip().split('\n')))
      )

      content = (
          file_contents
          | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

      assert_that(
          file_names,
          equal_to(WriteFilesTest.all_records),
          label='AssertFilesMatch')
      assert_that(
          content,
          matches_all(WriteFilesTest.LARGER_COLLECTION),
          label='AssertContentsMatch')
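  # destination_prefix_naming() produces names of the form
  # '<destination>-<window start>-<window end>-<shard>-of-<total><suffix>'
  # for non-global windows (see test_shard_naming below). The streaming tests
  # wrap it in no_colon_file_naming to replace ':' with '_', keeping the
  # generated names valid on filesystems that reject colons; the regexp
  # assertions below match the underscore form.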

  def test_streaming_different_file_types(self):
    dir = self._new_tempdir()
    input = iter(WriteFilesTest.SIMPLE_COLLECTION)
    ts = (
        TestStream().advance_watermark_to(0).add_elements(
            [next(input), next(input)]).advance_watermark_to(10).add_elements(
                [next(input),
                 next(input)]).advance_watermark_to(20).add_elements([
                     next(input), next(input)
                 ]).advance_watermark_to(30).add_elements([
                     next(input), next(input)
                 ]).advance_watermark_to(40).advance_watermark_to_infinity())

    def no_colon_file_naming(*args):
      file_name = fileio.destination_prefix_naming()(*args)
      return file_name.replace(':', '_')

    with TestPipeline() as p:
      _ = (
          p
          | ts
          | beam.WindowInto(FixedWindows(10))
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=no_colon_file_naming,
              max_writers_per_bundle=0,
          ))

    with TestPipeline() as p:
      cncf_files = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | "CncfFileNames" >> beam.Map(lambda fm: fm.path))

      apache_files = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

      assert_that(
          cncf_files,
          matches_all([
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_00-1970-01-01T00_00_10.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_10-1970-01-01T00_00_20.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_20-1970-01-01T00_00_30.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_30-1970-01-01T00_00_40.*')
          ]),
          label='verifyCNCFFiles')

      assert_that(
          apache_files,
          matches_all([
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_00-1970-01-01T00_00_10.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_10-1970-01-01T00_00_20.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_20-1970-01-01T00_00_30.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_30-1970-01-01T00_00_40.*')
          ]),
          label='verifyApacheFiles')

  def test_shard_naming(self):
    namer = fileio.default_file_naming(prefix='/path/to/file', suffix='.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, None, None, None, None),
        '/path/to/file.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, 1, 5, None, None),
        '/path/to/file-00001-of-00005.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, 1, 5, 'gz', None),
        '/path/to/file-00001-of-00005.txt.gz')
    self.assertEqual(
        namer(IntervalWindow(0, 100), None, 1, 5, None, None),
        '/path/to/file'
        '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()