# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

# NOTE(review): in every snippet below the optional `test` hook is invoked
# inside the `with` block; the original indentation was lost in extraction, so
# confirm this placement against the upstream file.


def combinevalues_simple(test=None):
  """Sum each key's list of counts with the builtin `sum`.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_simple]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    produce_counts = (
        pipeline
        | 'Create produce counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    total = (
        produce_counts
        | 'Sum' >> beam.CombineValues(sum)
        | beam.Map(print))
    # [END combinevalues_simple]
    if test:
      test(total)


def combinevalues_function(test=None):
  """Combine each key's values with a named function that caps the sum at 8.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_function]
  import apache_beam as beam

  def saturated_sum(values):
    # Add everything up, but never report more than the cap.
    cap = 8
    subtotal = sum(values)
    return min(subtotal, cap)

  with beam.Pipeline() as pipeline:
    plant_counts = (
        pipeline
        | 'Create plant counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    saturated_total = (
        plant_counts
        | 'Saturated sum' >> beam.CombineValues(saturated_sum)
        | beam.Map(print))
    # [END combinevalues_function]
    if test:
      test(saturated_total)


def combinevalues_lambda(test=None):
  """Same capped sum as `combinevalues_function`, written as a lambda.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_lambda]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    plant_counts = (
        pipeline
        | 'Create plant counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    saturated_total = (
        plant_counts
        | 'Saturated sum' >>
        beam.CombineValues(lambda values: min(sum(values), 8))
        | beam.Map(print))
    # [END combinevalues_lambda]
    if test:
      test(saturated_total)


def combinevalues_multiple_arguments(test=None):
  """Pass the cap to the combining lambda as an extra keyword argument.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_multiple_arguments]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    plant_counts = (
        pipeline
        | 'Create plant counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    saturated_total = (
        plant_counts
        | 'Saturated sum' >> beam.CombineValues(
            lambda values, max_value: min(sum(values), max_value),
            max_value=8)
        | beam.Map(print))
    # [END combinevalues_multiple_arguments]
    if test:
      test(saturated_total)


def combinevalues_side_inputs_singleton(test=None):
  """Supply the cap as a singleton side input built from a one-element Create.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_side_inputs_singleton]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    # The side input is created first, then the main input.
    max_value = pipeline | 'Create max_value' >> beam.Create([8])

    plant_counts = (
        pipeline
        | 'Create plant counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    saturated_total = (
        plant_counts
        | 'Saturated sum' >> beam.CombineValues(
            lambda values, max_value: min(sum(values), max_value),
            max_value=beam.pvalue.AsSingleton(max_value))
        | beam.Map(print))
    # [END combinevalues_side_inputs_singleton]
    if test:
      test(saturated_total)


def combinevalues_side_inputs_iter(test=None):
  """Clamp each key's sum to the min/max of an iterable side input.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_side_inputs_iter]
  import apache_beam as beam

  def bounded_sum(values, data_range):
    lower = min(data_range)
    subtotal = sum(values)
    if subtotal < lower:
      return lower
    # The upper bound is only computed once the lower bound check has passed.
    upper = max(data_range)
    return upper if subtotal > upper else subtotal

  with beam.Pipeline() as pipeline:
    data_range = pipeline | 'Create data_range' >> beam.Create([2, 4, 8])

    plant_counts = (
        pipeline
        | 'Create plant counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    bounded_total = (
        plant_counts
        | 'Bounded sum' >> beam.CombineValues(
            bounded_sum, data_range=beam.pvalue.AsIter(data_range))
        | beam.Map(print))
    # [END combinevalues_side_inputs_iter]
    if test:
      test(bounded_total)


def combinevalues_side_inputs_dict(test=None):
  """Clamp each key's sum to the 'min'/'max' entries of a dict side input.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_side_inputs_dict]
  import apache_beam as beam

  def bounded_sum(values, data_range):
    lower = data_range['min']
    subtotal = sum(values)
    if subtotal < lower:
      return lower
    # 'max' is only looked up once the lower bound check has passed.
    upper = data_range['max']
    return upper if subtotal > upper else subtotal

  with beam.Pipeline() as pipeline:
    data_range = pipeline | 'Create data_range' >> beam.Create([
        ('min', 2),
        ('max', 8),
    ])

    plant_counts = (
        pipeline
        | 'Create plant counts' >> beam.Create([
            ('🥕', [3, 2]),
            ('🍆', [1]),
            ('🍅', [4, 5, 3]),
        ]))
    bounded_total = (
        plant_counts
        | 'Bounded sum' >> beam.CombineValues(
            bounded_sum, data_range=beam.pvalue.AsDict(data_range))
        | beam.Map(print))
    # [END combinevalues_side_inputs_dict]
    if test:
      test(bounded_total)


def combinevalues_combinefn(test=None):
  """Use a CombineFn to turn each season's produce list into item fractions.

  Args:
    test: Optional callable; when truthy it is called with the result of the
      final transform.
  """
  # [START combinevalues_combinefn]
  import apache_beam as beam

  class AverageFn(beam.CombineFn):
    def create_accumulator(self):
      # Start from an empty item -> count mapping.
      return {}

    def add_input(self, accumulator, input):
      # accumulator == {}, input == '🥕'  ->  {'🥕': 1}
      accumulator[input] = accumulator.get(input, 0) + 1
      return accumulator

    def merge_accumulators(self, accumulators):
      # accumulators == [
      #     {'🥕': 1, '🍅': 1},
      #     {'🥕': 1, '🍅': 1, '🍆': 1},
      # ]
      merged = {}
      for partial in accumulators:
        for item, count in partial.items():
          merged[item] = merged.get(item, 0) + count
      # merged == {'🥕': 2, '🍅': 2, '🍆': 1}
      return merged

    def extract_output(self, accumulator):
      # accumulator == {'🥕': 2, '🍅': 2, '🍆': 1}
      grand_total = sum(accumulator.values())  # 5
      # result == {'🥕': 0.4, '🍅': 0.4, '🍆': 0.2}
      return {
          item: count / grand_total
          for item, count in accumulator.items()
      }

  with beam.Pipeline() as pipeline:
    produce_per_season = (
        pipeline
        | 'Create produce' >> beam.Create([
            ('spring', ['🥕', '🍅', '🥕', '🍅', '🍆']),
            ('summer', ['🥕', '🍅', '🌽', '🍅', '🍅']),
            ('fall', ['🥕', '🥕', '🍅', '🍅']),
            ('winter', ['🍆', '🍆']),
        ]))
    percentages_per_season = (
        produce_per_season
        | 'Average' >> beam.CombineValues(AverageFn())
        | beam.Map(print))
    # [END combinevalues_combinefn]
    if test:
      test(percentages_per_season)