github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/nexmark/queries/query10.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Query 10, 'Log to sharded files' (Not in original suite.)

Every window_size_sec, save all events from the last period into
NUM_SHARD_PER_WORKER * max_num_workers log files.
"""

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.utils.timestamp import Duration

NUM_SHARD_PER_WORKER = 5
LATE_BATCHING_PERIOD = 10

output_path = None
max_num_workers = 5

num_log_shards = NUM_SHARD_PER_WORKER * max_num_workers


class OutputFile(object):
  def __init__(self, max_timestamp, shard, index, timing, filename):
    self.max_timestamp = max_timestamp
    self.shard = shard
    self.index = index
    self.timing = timing
    self.filename = filename


def open_writable_gcs_file(options, filename):
  # TODO: [https://github.com/apache/beam/issues/20670] the Beam team has not
  # yet decided on this method; it is left blank and unspecified.
  pass
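

# A minimal sketch of how open_writable_gcs_file might be filled in with
# Beam's FileSystems abstraction once the issue above is resolved. The helper
# name below is hypothetical and not part of this module's API.
# FileSystems.create resolves the filesystem (local paths, gs://, ...) from
# the path scheme and returns a writable file object, so the stub's `options`
# argument is not needed for this illustration.
def _open_writable_file_sketch(filename):
  from apache_beam.io.filesystems import FileSystems
  return FileSystems.create(filename)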


def output_file_for(window, shard, pane):
  """
  Returns:
    an OutputFile object constructed with pane, window and shard.
  """
  filename = '%s/LOG-%s-%s-%03d-%s' % (
      output_path, window.max_timestamp(), shard, pane.index,
      pane.timing) if output_path else None
  return OutputFile(
      window.max_timestamp(), shard, pane.index, pane.timing, filename)


def index_path_for(window):
  """
  Returns:
    path to the index file containing all shard names, or None if no
    output_path is set.
  """
  if output_path:
    return '%s/INDEX-%s' % (output_path, window.max_timestamp())
  else:
    return None


def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The composite trigger fires when each sub-trigger (executed in order)
      # fires:
      # first, repeatedly 1. after at least max_log_events elements in pane
      #                   2. or finally when the watermark passes the end of
      #                      the window;
      # then, repeatedly  1. after at least max_log_events elements in pane
      #                   2. or when processing time passes the first element
      #                      in pane + delay.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))


class ShardEventsDoFn(beam.DoFn):
  def process(self, element):
    shard_number = abs(hash(element) % num_log_shards)
    shard = 'shard-%05d-of-%05d' % (shard_number, num_log_shards)
    yield shard, element


class WriteEventDoFn(beam.DoFn):
  def process(
      self,
      element,
      pipeline_options,
      window=beam.DoFn.WindowParam,
      pane_info=beam.DoFn.PaneInfoParam):
    shard = element[0]
    options = pipeline_options.view_as(GoogleCloudOptions)
    output_file = output_file_for(window, shard, pane_info)
    if output_file.filename:
      # Does nothing yet, because open_writable_gcs_file is still a stub.
      open_writable_gcs_file(options, output_file.filename)
      for event in element[1]:  # pylint: disable=unused-variable
        # write to file
        pass
    yield None, output_file


class WriteIndexDoFn(beam.DoFn):
  def process(self, element, pipeline_options, window=beam.DoFn.WindowParam):
    options = pipeline_options.view_as(GoogleCloudOptions)
    filename = index_path_for(window)
    if filename:
      # Does nothing yet, because open_writable_gcs_file is still a stub.
      open_writable_gcs_file(options, filename)
      for output_file in element[1]:  # pylint: disable=unused-variable
        # write to file
        pass
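

# A hedged usage sketch, not part of the benchmark suite: wiring load() into a
# small standalone pipeline. The plain strings below stand in for real nexmark
# events, the timestamps are arbitrary, and the metadata keys are the ones
# load() reads above. With output_path left as None, the write DoFns skip file
# I/O, so this only exercises the sharding, windowing and triggering logic.
if __name__ == '__main__':
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.transforms.window import TimestampedValue

  demo_options = PipelineOptions()
  with beam.Pipeline(options=demo_options) as pipeline:
    demo_events = (
        pipeline
        | beam.Create(['event-%03d' % i for i in range(20)])
        # Spread the fake events over the first two 10-second windows.
        | beam.Map(lambda e: TimestampedValue(e, int(e[-3:]))))
    load(
        demo_events,
        metadata={'window_size_sec': 10, 'max_log_events': 5},
        pipeline_options=demo_options)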