github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/datastore/v1new/util.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Utility functions & classes that are _not_ specific to the datastore client.
#
# For internal use only; no backwards-compatibility guarantees.

# pytype: skip-file

import math

# Constants used in batched mutation RPCs:
WRITE_BATCH_INITIAL_SIZE = 50
# Max allowed Datastore writes per batch, and max bytes per batch.
# Note that the max bytes per batch set here is lower than the 10MB limit
# actually enforced by the API, to leave space for the CommitRequest wrapper
# around the mutations.
# https://cloud.google.com/datastore/docs/concepts/limits
WRITE_BATCH_MAX_SIZE = 500
WRITE_BATCH_MAX_BYTES_SIZE = 9000000
WRITE_BATCH_MIN_SIZE = 5
WRITE_BATCH_TARGET_LATENCY_MS = 6000


class MovingSum(object):
  """Class that keeps track of a rolling window sum.

  For use in tracking recent performance of the connector.

  Intended to be similar to
  org.apache.beam.sdk.util.MovingFunction(..., Sum.ofLongs()), but for
  convenience we expose the count of entries as well so this doubles as a
  moving average tracker.
  """
  def __init__(self, window_ms, bucket_ms):
    if window_ms < bucket_ms or bucket_ms <= 0:
      raise ValueError("window_ms >= bucket_ms > 0 please")
    self._num_buckets = int(math.ceil(window_ms / bucket_ms))
    self._bucket_ms = bucket_ms
    self._Reset(now=0)  # initialize the moving window members

  def _Reset(self, now):
    self._current_index = 0  # pointer into self._buckets
    self._current_ms_since_epoch = math.floor(
        now / self._bucket_ms) * self._bucket_ms

    # _buckets is a list where each element is a list [sum, num_samples].
    # This is a circular buffer where
    # [_current_index] represents the time range
    #   [_current_ms_since_epoch, _current_ms_since_epoch+_bucket_ms),
    # [_current_index-1] represents the immediately prior time range
    #   [_current_ms_since_epoch-_bucket_ms, _current_ms_since_epoch),
    # etc, wrapping around from the start to the end of the array, so
    # [_current_index+1] is the element representing the oldest bucket.
    self._buckets = [[0, 0] for _ in range(0, self._num_buckets)]

  def _Flush(self, now):
    """Advances the rolling window to include `now`, clearing expired buckets.

    Args:
      now: int, milliseconds since epoch
    """
    if now >= (self._current_ms_since_epoch +
               self._bucket_ms * self._num_buckets):
      # Time moved forward so far that all currently held data is outside of
      # the window.  It is faster to simply reset our data.
      self._Reset(now)
      return

    while now > self._current_ms_since_epoch + self._bucket_ms:
      # Advance time by one _bucket_ms, setting the new bucket's counts to 0.
      self._current_ms_since_epoch += self._bucket_ms
      self._current_index = (self._current_index + 1) % self._num_buckets
      self._buckets[self._current_index] = [0, 0]
      # Intentional dead reckoning here; we don't care about staying precisely
      # aligned with multiples of _bucket_ms since the epoch, we just need our
      # buckets to represent the most recent _window_ms time window.

  def sum(self, now):
    self._Flush(now)
    return sum(bucket[0] for bucket in self._buckets)

  def add(self, now, inc):
    self._Flush(now)
    bucket = self._buckets[self._current_index]
    bucket[0] += inc
    bucket[1] += 1

  def count(self, now):
    self._Flush(now)
    return sum(bucket[1] for bucket in self._buckets)

  def has_data(self, now):
    return self.count(now) > 0
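
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a minimal example of
# how MovingSum doubles as a moving-average tracker. The function name, the
# window/bucket sizes, and the timestamps below are hypothetical values chosen
# for demonstration only.
def _example_moving_average_sketch():
  tracker = MovingSum(window_ms=120000, bucket_ms=10000)
  tracker.add(now=1000, inc=30)  # one sample worth 30 at t=1s
  tracker.add(now=2000, inc=50)  # another sample at t=2s
  assert tracker.has_data(now=3000)
  mean = tracker.sum(3000) / tracker.count(3000)  # (30 + 50) / 2 == 40.0
  # Once the 120s window has fully elapsed, both samples expire and the
  # tracker reports no data again.
  assert not tracker.has_data(now=300000)
  return mean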

class DynamicBatchSizer(object):
  """Determines request sizes for future Datastore RPCs."""
  def __init__(self):
    self._commit_time_per_entity_ms = MovingSum(
        window_ms=120000, bucket_ms=10000)

  def get_batch_size(self, now):
    """Returns the recommended size for datastore RPCs at this time."""
    if not self._commit_time_per_entity_ms.has_data(now):
      return WRITE_BATCH_INITIAL_SIZE

    recent_mean_latency_ms = (
        self._commit_time_per_entity_ms.sum(now) //
        self._commit_time_per_entity_ms.count(now))
    return max(
        WRITE_BATCH_MIN_SIZE,
        min(
            WRITE_BATCH_MAX_SIZE,
            WRITE_BATCH_TARGET_LATENCY_MS // max(recent_mean_latency_ms, 1)))

  def report_latency(self, now, latency_ms, num_mutations):
    """Report the latency of a Datastore RPC.

    Args:
      now: double, completion time of the RPC as seconds since the epoch.
      latency_ms: double, the observed latency in milliseconds for this RPC.
      num_mutations: int, number of mutations contained in the RPC.
    """
    self._commit_time_per_entity_ms.add(now, latency_ms / num_mutations)
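
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: how DynamicBatchSizer
# adapts the recommended batch size to observed commit latency. The function
# name, timestamps, and latencies are hypothetical values chosen for
# demonstration only.
def _example_dynamic_batch_sizing_sketch():
  sizer = DynamicBatchSizer()
  # With no latency reports yet, the initial batch size is recommended.
  assert sizer.get_batch_size(now=0) == WRITE_BATCH_INITIAL_SIZE  # 50

  # A fast commit (400ms for 50 mutations, i.e. 8ms per entity) pushes the
  # recommendation to the target latency divided by per-entity latency
  # (6000 // 8 = 750), capped at the maximum batch size of 500.
  sizer.report_latency(now=0, latency_ms=400, num_mutations=50)
  assert sizer.get_batch_size(now=1) == WRITE_BATCH_MAX_SIZE  # 500

  # A slow commit (50ms per entity) drags the rolling mean up to 29ms per
  # entity, so the recommendation drops to 6000 // 29 = 206 entities.
  sizer.report_latency(now=1, latency_ms=5000, num_mutations=100)
  return sizer.get_batch_size(now=2)  # 206.0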