github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/nexmark/queries/query10.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Query 10, 'Log to sharded files' (Not in original suite.)

Every window_size_sec, save all events from the last window into
NUM_SHARD_PER_WORKER * max_num_workers sharded log files.
"""

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.utils.timestamp import Duration

NUM_SHARD_PER_WORKER = 5
LATE_BATCHING_PERIOD = 10

output_path = None
max_num_workers = 5

num_log_shards = NUM_SHARD_PER_WORKER * max_num_workers
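# With the defaults above (5 shards per worker, 5 workers), each window's
# events are spread over 25 log shards, named e.g. 'shard-00007-of-00025'.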


class OutputFile(object):
  """Value object describing one sharded log file: the window's max timestamp,
  the shard name, the pane index and timing, and the output filename."""
  def __init__(self, max_timestamp, shard, index, timing, filename):
    self.max_timestamp = max_timestamp
    self.shard = shard
    self.index = index
    self.timing = timing
    self.filename = filename


def open_writable_gcs_file(options, filename):
  # TODO: [https://github.com/apache/beam/issues/20670] the Beam team has not
  #   yet decided on an implementation for this method, so it is intentionally
  #   left as a no-op.
  pass


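# A minimal sketch (an assumption, not the pending implementation tracked in
# the TODO above) of how a writable GCS or local file could be opened through
# Beam's FileSystems abstraction:
#
#   from apache_beam.io.filesystems import FileSystems
#   handle = FileSystems.create(filename, mime_type='text/plain')
#   handle.write(b'<event bytes>')
#   handle.close()

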
def output_file_for(window, shard, pane):
  """
  Returns:
    an OutputFile for the given window, shard and pane; its filename is None
      when no output_path is set.
  """
  filename = '%s/LOG-%s-%s-%03d-%s' % (
      output_path, window.max_timestamp(), shard, pane.index,
      pane.timing) if output_path else None
  return OutputFile(
      window.max_timestamp(), shard, pane.index, pane.timing, filename)


def index_path_for(window):
  """
  Returns:
    the path to the index file listing all shard names, or None if no
      output_path is set.
  """
  if output_path:
    return '%s/INDEX-%s' % (output_path, window.max_timestamp())
  else:
    return None
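
# Putting the two helpers together, each window produces files of the form
# (illustrative, assuming output_path is set, e.g. to a GCS prefix):
#   <output_path>/LOG-<window max timestamp>-<shard name>-<pane index>-<pane timing>
#   <output_path>/INDEX-<window max timestamp>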


def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The composite trigger fires as each sub-trigger (executed in order)
      # fires:
      #   first, repeatedly after at least max_log_events elements in the pane,
      #   until the watermark passes the end of the window;
      #   then, repeatedly after at least max_log_events elements in the pane,
      #   or after processing time passes the first element in the pane plus
      #   LATE_BATCHING_PERIOD.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))

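# Illustrative usage (an assumption; in practice the Nexmark launcher wires
# this up): `events` is a PCollection of Nexmark events and `metadata` supplies
# the knobs read above, roughly as in
#
#   load(
#       events,
#       metadata={'window_size_sec': 10, 'max_log_events': 100000},
#       pipeline_options=pipeline_options)
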

class ShardEventsDoFn(beam.DoFn):
  """Keys each event by one of num_log_shards shard names."""
  def process(self, element):
    shard_number = abs(hash(element) % num_log_shards)
    shard = 'shard-%05d-of-%05d' % (shard_number, num_log_shards)
    yield shard, element


class WriteEventDoFn(beam.DoFn):
  """Writes the events of one (shard, window, pane) to a log file and emits
  the corresponding OutputFile record."""
  def process(
      self,
      element,
      pipeline_options,
      window=beam.DoFn.WindowParam,
      pane_info=beam.DoFn.PaneInfoParam):
    shard = element[0]
    options = pipeline_options.view_as(GoogleCloudOptions)
    output_file = output_file_for(window, shard, pane_info)
    if output_file.filename:
      # Currently a no-op, since open_writable_gcs_file is unimplemented.
      open_writable_gcs_file(options, output_file.filename)
      for event in element[1]:  # pylint: disable=unused-variable
        # The event would be written to the log file here.
        pass
    yield None, output_file


class WriteIndexDoFn(beam.DoFn):
  """Writes, for each window, an index file listing the log files produced
  for that window."""
  def process(self, element, pipeline_options, window=beam.DoFn.WindowParam):
    options = pipeline_options.view_as(GoogleCloudOptions)
    filename = index_path_for(window)
    if filename:
      # Currently a no-op, since open_writable_gcs_file is unimplemented.
      open_writable_gcs_file(options, filename)
      for output_file in element[1]:  # pylint: disable=unused-variable
        # The OutputFile's name would be written to the index file here.
        pass