#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# cython: profile=False
# cython: overflowcheck=True
# cython: language_level=3

"""Counters collect the progress of the Worker for reporting to the service.

For internal use only; no backwards-compatibility guarantees.
"""

# pytype: skip-file

import threading
from collections import namedtuple
from typing import TYPE_CHECKING
from typing import Dict

from apache_beam.transforms import cy_combiners

if TYPE_CHECKING:
  from apache_beam.transforms import core

# Information identifying the IO being measured by a counter.
#
# A CounterName with IOTarget helps identify the IO being measured by a
# counter.
#
# It may represent the consumption of Shuffle IO, or the consumption of
# side inputs. The way in which each is represented is explained in the
# documentation of the side_input_id, and shuffle_id functions.
IOTargetName = namedtuple(
    'IOTargetName', ['requesting_step_name', 'input_index'])


def side_input_id(step_name, input_index):
  # type: (str, int) -> IOTargetName

  """Create an IOTargetName that identifies the reading of a side input.

  Given a step "s4" that receives two side inputs, then the CounterName
  that represents the consumption of side input number 2 is:
  * step_name: s4              <---|
  * input_index: 2             <---|-- Identifying the side input itself
  * requesting_step_name: s4   <-- Identifying the step that reads from it.

  If "s4" emits the whole AsIter of the side input, down to a step, say "s5",
  then the requesting_step_name of the subsequent consumption will be "s5".

  Args:
    step_name: stored in the ``requesting_step_name`` field of the result.
    input_index: stored in the ``input_index`` field of the result.

  Returns:
    An IOTargetName built from the two arguments.
  """
  return IOTargetName(step_name, input_index)


def shuffle_id(step_name):
  # type: (str) -> IOTargetName

  """Create an IOTargetName that identifies a GBK step.

  Given a step "s6" that is downstream from a GBK "s5", then "s6" will read
  from shuffle. The CounterName that quantifies the consumption of data from
  shuffle has:
  * step_name: s5
  * requesting_step_name: s6

  If "s6" emits the whole iterable down to a step, say "s7", and "s7" continues
  to consume data from the iterable, then a new CounterName will be:
  * step_name: s5              <--- Identifying the GBK
  * requesting_step_name: s6

  Args:
    step_name: stored in the ``requesting_step_name`` field of the result.

  Returns:
    An IOTargetName whose ``input_index`` is None.
  """
  # NOTE(review): the last example above repeats "requesting_step_name: s6";
  # by analogy with side_input_id it presumably should read "s7" -- confirm.
  return IOTargetName(step_name, None)


_CounterName = namedtuple(
    '_CounterName',
    [
        'name',
        'stage_name',
        'step_name',
        'system_name',
        'namespace',
        'origin',
        'output_index',
        'io_target'
    ])


class CounterName(_CounterName):
  """Naming information for a counter."""
  # Sentinel values for the ``origin`` field.
  SYSTEM = object()
  USER = object()

  def __new__(
      cls,
      name,
      stage_name=None,
      step_name=None,
      system_name=None,
      namespace=None,
      origin=None,
      output_index=None,
      io_target=None):
    """Builds a CounterName; only ``name`` is required.

    A missing (falsy) ``origin`` defaults to ``CounterName.SYSTEM``.
    """
    origin = origin or CounterName.SYSTEM
    return super().__new__(
        cls,
        name,
        stage_name,
        step_name,
        system_name,
        namespace,
        origin,
        output_index,
        io_target)

  def __repr__(self):
    return '<CounterName<%s> at %s>' % (self._str_internal(), hex(id(self)))

  def __str__(self):
    return self._str_internal()

  def _str_internal(self):
    """Returns the dash-separated display form used by str() and repr()."""
    if self.origin == CounterName.USER:
      return 'user-%s-%s' % (self.step_name, self.name)
    elif self.origin == CounterName.SYSTEM and self.output_index:
      # Only taken for a truthy output_index; None (and 0) fall through to the
      # stage-based format below.
      return '%s-out%s-%s' % (self.step_name, self.output_index, self.name)
    else:
      return '%s-%s-%s' % (self.stage_name, self.step_name, self.name)


class Counter(object):
  """A counter aggregates a series of values.

  The aggregation kind of the Counter is specified when the Counter
  is created.  The values aggregated must be of an appropriate type for the
  aggregation used.  Aggregations supported are listed in the code.

  (The aggregated value will be reported to the Dataflow service.)

  Do not create directly; call CounterFactory.get_counter instead.

  Attributes:
    name: the name of the counter, a string
    combine_fn: the CombineFn to use for aggregation
    accumulator: the accumulator created for the combine_fn
  """

  # Handy references to common counters.
  SUM = cy_combiners.SumInt64Fn()
  MEAN = cy_combiners.MeanInt64Fn()
  BEAM_DISTRIBUTION = cy_combiners.DistributionInt64Fn()

  # Dataflow Distribution Accumulator Fn.
  # TODO(https://github.com/apache/beam/issues/18843): Generalize distribution
  # counter if necessary.
  DATAFLOW_DISTRIBUTION = cy_combiners.DataflowDistributionCounterFn()

  def __init__(self, name, combine_fn):
    # type: (CounterName, core.CombineFn) -> None

    """Creates a Counter object.

    Args:
      name: the name of this counter. It may be a string,
            or a CounterName object.
      combine_fn: the CombineFn to use for aggregation
    """
    self.name = name
    self.combine_fn = combine_fn
    self.accumulator = combine_fn.create_accumulator()
    # Cache the bound method so update() avoids repeated attribute lookups.
    self._add_input = self.combine_fn.add_input

  def update(self, value):
    """Folds a single value into the accumulator."""
    self.accumulator = self._add_input(self.accumulator, value)

  def update_n(self, value, n):
    """Update the counter with the same value N times"""
    for _ in range(n):
      # The accumulator (not the Counter itself) is what add_input expects as
      # its first argument, exactly as in update().
      self.accumulator = self._add_input(self.accumulator, value)

  def reset(self, value=None):
    """Discards the aggregated state and starts from a fresh accumulator.

    Args:
      value: unused. Kept (now optional) for backward compatibility, so that
        the argument-less ``counter.reset()`` call made by
        ``CounterFactory.reset`` also works for plain counters.
    """
    self.accumulator = self.combine_fn.create_accumulator()

  def value(self):
    """Returns the combine_fn's output for the current accumulator."""
    return self.combine_fn.extract_output(self.accumulator)

  def __str__(self):
    return '<%s>' % self._str_internal()

  def __repr__(self):
    return '<%s at %s>' % (self._str_internal(), hex(id(self)))

  def _str_internal(self):
    return '%s %s %s' % (
        self.name, self.combine_fn.__class__.__name__, self.value())


class AccumulatorCombineFnCounter(Counter):
  """Counter optimized for a mutating accumulator that holds all the logic."""
  def __init__(self, name, combine_fn):
    # type: (CounterName, cy_combiners.AccumulatorCombineFn) -> None

    """Creates the counter; combine_fn must be an AccumulatorCombineFn."""
    assert isinstance(combine_fn, cy_combiners.AccumulatorCombineFn)
    super().__init__(name, combine_fn)
    # reset() also binds the fast-path methods used by update()/update_n().
    self.reset()

  def update(self, value):
    """Adds a single value directly through the accumulator."""
    self._fast_add_input(value)

  def update_n(self, value, n):
    """Adds the same value n times directly through the accumulator."""
    self._fast_add_input_n(value, n)

  def reset(self):
    """Replaces the accumulator and rebinds the cached add methods to it."""
    self.accumulator = self.combine_fn.create_accumulator()
    self._fast_add_input = self.accumulator.add_input
    self._fast_add_input_n = self.accumulator.add_input_n


class CounterFactory(object):
  """Keeps track of unique counters."""
  def __init__(self):
    self.counters = {}  # type: Dict[CounterName, Counter]

    # Lock to be acquired when accessing the counters map.
    self._lock = threading.Lock()

  def get_counter(self, name, combine_fn):
    # type: (CounterName, core.CombineFn) -> Counter

    """Returns a counter with the requested name.

    Passing in the same name will return the same counter; the
    combine_fn must agree.

    Args:
      name: the name of this counter.  Typically has three parts:
        "step-output-counter".
      combine_fn: the CombineFn to use for aggregation
    Returns:
      A new or existing counter with the requested name.
    """
    with self._lock:
      counter = self.counters.get(name)
      if counter is not None:
        assert counter.combine_fn == combine_fn
      else:
        if isinstance(combine_fn, cy_combiners.AccumulatorCombineFn):
          counter = AccumulatorCombineFnCounter(name, combine_fn)
        else:
          counter = Counter(name, combine_fn)
        self.counters[name] = counter
      return counter

  def reset(self):
    """Resets every counter currently registered with this factory."""
    # Counters are cached in state sampler states.
    with self._lock:
      for counter in self.counters.values():
        counter.reset()

  def get_counters(self):
    """Returns the current set of counters.

    Returns:
      An iterable that contains the current set of counters. To make sure that
      multiple threads can iterate over the set of counters, we return a new
      iterable here. Note that the actual set of counters may get modified after
      this method returns hence the returned iterable may be stale.
    """
    with self._lock:
      # Copy while holding the lock: dict.values() is a live view, and
      # iterating it after the lock is released would fail if another thread
      # registers a counter in the meantime.
      return list(self.counters.values())