github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/datastore/v1new/util.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Utility functions & classes that are _not_ specific to the datastore client.
#
# For internal use only; no backwards-compatibility guarantees.

# pytype: skip-file

import math

# Constants used in batched mutation RPCs:
WRITE_BATCH_INITIAL_SIZE = 50
# Max allowed Datastore writes per batch, and max bytes per batch.
# Note that the max bytes per batch set here is lower than the 10MB limit
# actually enforced by the API, to leave space for the CommitRequest wrapper
# around the mutations.
# https://cloud.google.com/datastore/docs/concepts/limits
WRITE_BATCH_MAX_SIZE = 500
WRITE_BATCH_MAX_BYTES_SIZE = 9000000
WRITE_BATCH_MIN_SIZE = 5
WRITE_BATCH_TARGET_LATENCY_MS = 6000


class MovingSum(object):
  """Class that keeps track of a rolling window sum.

  For use in tracking recent performance of the connector.

  Intended to be similar to
  org.apache.beam.sdk.util.MovingFunction(..., Sum.ofLongs()), but for
  convenience we expose the count of entries as well so this doubles as a
  moving average tracker.
  """
  def __init__(self, window_ms, bucket_ms):
    if window_ms < bucket_ms or bucket_ms <= 0:
      raise ValueError("window_ms >= bucket_ms > 0 please")
    self._num_buckets = int(math.ceil(window_ms / bucket_ms))
    self._bucket_ms = bucket_ms
    self._Reset(now=0)  # initialize the moving window members

  def _Reset(self, now):
    self._current_index = 0  # pointer into self._buckets
    self._current_ms_since_epoch = math.floor(
        now / self._bucket_ms) * self._bucket_ms

    # _buckets is a list where each element is a list [sum, num_samples].
    # This is a circular buffer where
    # [_current_index] represents the time range
    #   [_current_ms_since_epoch, _current_ms_since_epoch+_bucket_ms),
    # [_current_index-1] represents the immediately prior time range
    #   [_current_ms_since_epoch-_bucket_ms, _current_ms_since_epoch),
    # etc, wrapping around from the start to the end of the array, so
    # [_current_index+1] is the element representing the oldest bucket.
    self._buckets = [[0, 0] for _ in range(0, self._num_buckets)]

  def _Flush(self, now):
    """Advances the rolling window to include `now`, clearing expired buckets.

    Args:
      now: int, milliseconds since epoch
    """
    if now >= (self._current_ms_since_epoch +
               self._bucket_ms * self._num_buckets):
      # Time moved forward so far that all currently held data is outside of
      # the window.  It is faster to simply reset our data.
      self._Reset(now)
      return

    while now > self._current_ms_since_epoch + self._bucket_ms:
      # Advance time by one _bucket_ms, setting the new bucket's counts to 0.
      self._current_ms_since_epoch += self._bucket_ms
      self._current_index = (self._current_index + 1) % self._num_buckets
      self._buckets[self._current_index] = [0, 0]
      # Intentional dead reckoning here; we don't care about staying precisely
      # aligned with multiples of _bucket_ms since the epoch, we just need our
      # buckets to represent the most recent _window_ms time window.

  def sum(self, now):
    self._Flush(now)
    return sum(bucket[0] for bucket in self._buckets)

  def add(self, now, inc):
    self._Flush(now)
    bucket = self._buckets[self._current_index]
    bucket[0] += inc
    bucket[1] += 1

  def count(self, now):
    self._Flush(now)
    return sum(bucket[1] for bucket in self._buckets)

  def has_data(self, now):
    return self.count(now) > 0
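
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a minimal example of
# how MovingSum doubles as a moving-average tracker. The function name, the
# window/bucket sizes, and the timestamps below are hypothetical values chosen
# for demonstration only.
def _example_moving_average_sketch():
  tracker = MovingSum(window_ms=120000, bucket_ms=10000)
  tracker.add(now=1000, inc=30)  # one sample worth 30 at t=1s
  tracker.add(now=2000, inc=50)  # another sample at t=2s
  assert tracker.has_data(now=3000)
  mean = tracker.sum(3000) / tracker.count(3000)  # (30 + 50) / 2 == 40.0
  # Once the 120s window has fully elapsed, both samples expire and the
  # tracker reports no data again.
  assert not tracker.has_data(now=300000)
  return mean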

class DynamicBatchSizer(object):
  """Determines request sizes for future Datastore RPCs."""
  def __init__(self):
    self._commit_time_per_entity_ms = MovingSum(
        window_ms=120000, bucket_ms=10000)

  def get_batch_size(self, now):
    """Returns the recommended size for datastore RPCs at this time."""
    if not self._commit_time_per_entity_ms.has_data(now):
      return WRITE_BATCH_INITIAL_SIZE

    recent_mean_latency_ms = (
        self._commit_time_per_entity_ms.sum(now) //
        self._commit_time_per_entity_ms.count(now))
    return max(
        WRITE_BATCH_MIN_SIZE,
        min(
            WRITE_BATCH_MAX_SIZE,
            WRITE_BATCH_TARGET_LATENCY_MS // max(recent_mean_latency_ms, 1)))

  def report_latency(self, now, latency_ms, num_mutations):
    """Report the latency of a Datastore RPC.

    Args:
      now: double, completion time of the RPC as seconds since the epoch.
      latency_ms: double, the observed latency in milliseconds for this RPC.
      num_mutations: int, number of mutations contained in the RPC.
    """
    self._commit_time_per_entity_ms.add(now, latency_ms / num_mutations)
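
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: how DynamicBatchSizer
# adapts the recommended batch size to observed commit latency. The function
# name, timestamps, and latencies are hypothetical values chosen for
# demonstration only.
def _example_dynamic_batch_sizing_sketch():
  sizer = DynamicBatchSizer()
  # With no latency reports yet, the initial batch size is recommended.
  assert sizer.get_batch_size(now=0) == WRITE_BATCH_INITIAL_SIZE  # 50

  # A fast commit (400ms for 50 mutations, i.e. 8ms per entity) pushes the
  # recommendation to the target latency divided by per-entity latency
  # (6000 // 8 = 750), capped at the maximum batch size of 500.
  sizer.report_latency(now=0, latency_ms=400, num_mutations=50)
  assert sizer.get_batch_size(now=1) == WRITE_BATCH_MAX_SIZE  # 500

  # A slow commit (50ms per entity) drags the rolling mean up to 29ms per
  # entity, so the recommendation drops to 6000 // 29 = 206 entities.
  sizer.report_latency(now=1, latency_ms=5000, num_mutations=100)
  return sizer.get_batch_size(now=2)  # 206.0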