github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/synthetic_pipeline.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/synthetic_pipeline.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A set of utilities to write pipelines for performance tests.
    19  
    20  This module offers a way to create pipelines using synthetic sources and steps.
    21  Exact shape of the pipeline and the behaviour of sources and steps can be
    22  controlled through arguments. Please see function 'parse_args()' for more
    23  details about the arguments.
    24  
    25  Shape of the pipeline is primarily controlled through two arguments. Argument
    26  'steps' can be used to define a list of steps as a JSON string. Argument
    27  'barrier' describes how these steps are separated from each other. Argument
    28  'barrier' can be use to build a pipeline as a series of steps or a tree of
    29  steps with a fanin or a fanout of size 2.
    30  
    31  Other arguments describe what gets generated by synthetic sources that produce
    32  data for the pipeline.
    33  """
    34  
    35  # pytype: skip-file
    36  
    37  import argparse
    38  import json
    39  import logging
    40  import math
    41  import os
    42  import sys
    43  import time
    44  from random import Random
    45  from typing import Tuple
    46  
    47  import apache_beam as beam
    48  from apache_beam import pvalue
    49  from apache_beam import typehints
    50  from apache_beam.io import WriteToText
    51  from apache_beam.io import iobase
    52  from apache_beam.io import range_trackers
    53  from apache_beam.io import restriction_trackers
    54  from apache_beam.io.restriction_trackers import OffsetRange
    55  from apache_beam.io.restriction_trackers import OffsetRestrictionTracker
    56  from apache_beam.options.pipeline_options import PipelineOptions
    57  from apache_beam.options.pipeline_options import SetupOptions
    58  from apache_beam.testing.test_pipeline import TestPipeline
    59  from apache_beam.transforms import userstate
    60  from apache_beam.transforms.core import RestrictionProvider
    61  
    62  try:
    63    import numpy as np
    64  except ImportError:
    65    np = None
    66  
    67  
    68  class _Random(Random):
    69    """A subclass of `random.Random` from the Python Standard Library that
    70    provides a method returning random bytes of arbitrary length.
    71    """
    72  
    73    # `numpy.random.RandomState` does not provide `random()` method, we keep this
    74    # for compatibility reasons.
    75    random_sample = Random.random
    76  
    77    def bytes(self, length):
    78      """Returns random bytes.
    79  
    80      Args:
    81        length (int): Number of random bytes.
    82      """
    83      return self.getrandbits(length * 8).to_bytes(length, sys.byteorder)
    84  
    85  
    86  Generator = _Random
    87  
    88  
    89  def parse_byte_size(s):
    90    suffixes = 'BKMGTP'
    91    if s[-1] in suffixes:
    92      return int(float(s[:-1]) * 1024**suffixes.index(s[-1]))
    93  
    94    return int(s)
    95  
    96  
    97  def div_round_up(a, b):
    98    """Return ceil(a/b)."""
    99    return int(math.ceil(float(a) / b))
   100  
   101  
   102  def rotate_key(element):
   103    """Returns a new key-value pair of the same size but with a different key."""
   104    (key, value) = element
   105    return key[-1:] + key[:-1], value
   106  
   107  
   108  def initial_splitting_zipf(
   109      start_position,
   110      stop_position,
   111      desired_num_bundles,
   112      distribution_parameter,
   113      num_total_records=None):
   114    """Split the given range (defined by start_position, stop_position) into
   115       desired_num_bundles using zipf with the given distribution_parameter.
   116    """
   117    if not num_total_records:
   118      num_total_records = stop_position - start_position
   119    samples = np.random.zipf(distribution_parameter, desired_num_bundles)
   120    total = sum(samples)
   121    relative_bundle_sizes = [(float(sample) / total) for sample in samples]
   122    bundle_ranges = []
   123    start = start_position
   124    index = 0
   125    while start < stop_position:
   126      if index == desired_num_bundles - 1:
   127        bundle_ranges.append((start, stop_position))
   128        break
   129      stop = start + int(num_total_records * relative_bundle_sizes[index])
   130      bundle_ranges.append((start, stop))
   131      start = stop
   132      index += 1
   133    return bundle_ranges
   134  
   135  
   136  class SyntheticStep(beam.DoFn):
   137    """A DoFn of which behavior can be controlled through prespecified parameters.
   138    """
   139    def __init__(
   140        self,
   141        per_element_delay_sec=0,
   142        per_bundle_delay_sec=0,
   143        output_records_per_input_record=1,
   144        output_filter_ratio=0):
   145      if per_element_delay_sec and per_element_delay_sec < 1e-3:
   146        raise ValueError(
   147            'Per element sleep time must be at least 1e-3. '
   148            'Received: %r',
   149            per_element_delay_sec)
   150      self._per_element_delay_sec = per_element_delay_sec
   151      self._per_bundle_delay_sec = per_bundle_delay_sec
   152      self._output_records_per_input_record = output_records_per_input_record
   153      self._output_filter_ratio = output_filter_ratio
   154  
   155    def start_bundle(self):
   156      self._start_time = time.time()
   157  
   158    def finish_bundle(self):
   159      # The target is for the enclosing stage to take as close to as possible
   160      # the given number of seconds, so we only sleep enough to make up for
   161      # overheads not incurred elsewhere.
   162      to_sleep = self._per_bundle_delay_sec - (time.time() - self._start_time)
   163  
   164      # Ignoring sub-millisecond sleep times.
   165      if to_sleep >= 1e-3:
   166        time.sleep(to_sleep)
   167  
   168    def process(self, element):
   169      if self._per_element_delay_sec >= 1e-3:
   170        time.sleep(self._per_element_delay_sec)
   171      filter_element = False
   172      if self._output_filter_ratio > 0:
   173        if np.random.random() < self._output_filter_ratio:
   174          filter_element = True
   175  
   176      if not filter_element:
   177        for _ in range(self._output_records_per_input_record):
   178          yield element
   179  
   180  
   181  class NonLiquidShardingOffsetRangeTracker(OffsetRestrictionTracker):
   182    """An OffsetRangeTracker that doesn't allow splitting. """
   183    def try_split(self, split_offset):
   184      pass  # Don't split.
   185  
   186    def checkpoint(self):
   187      pass  # Don't split.
   188  
   189  
   190  class SyntheticSDFStepRestrictionProvider(RestrictionProvider):
   191    """A `RestrictionProvider` for SyntheticSDFStep.
   192  
   193    An initial_restriction and split that operate on num_records and ignores
   194    source description (element). Splits into initial_splitting_num_bundles.
   195    Returns size_estimate_override as restriction size, if set. Otherwise uses
   196    element size.
   197  
   198    If initial_splitting_uneven_chunks, produces uneven chunks.
   199  
   200    """
   201    def __init__(
   202        self,
   203        num_records,
   204        initial_splitting_num_bundles,
   205        initial_splitting_uneven_chunks,
   206        disable_liquid_sharding,
   207        size_estimate_override):
   208      self._num_records = num_records
   209      self._initial_splitting_num_bundles = initial_splitting_num_bundles
   210      self._initial_splitting_uneven_chunks = initial_splitting_uneven_chunks
   211      self._disable_liquid_sharding = disable_liquid_sharding
   212      self._size_estimate_override = size_estimate_override
   213  
   214    def initial_restriction(self, element):
   215      return OffsetRange(0, self._num_records)
   216  
   217    def create_tracker(self, restriction):
   218      if self._disable_liquid_sharding:
   219        return NonLiquidShardingOffsetRangeTracker(restriction)
   220      else:
   221        return OffsetRestrictionTracker(restriction)
   222  
   223    def split(self, element, restriction):
   224      elems = restriction.size()
   225      if (self._initial_splitting_uneven_chunks and
   226          self._initial_splitting_num_bundles > 1 and elems > 1):
   227        bundle_ranges = initial_splitting_zipf(
   228            restriction.start,
   229            restriction.stop,
   230            self._initial_splitting_num_bundles,
   231            3.0)
   232        for start, stop in bundle_ranges:
   233          yield OffsetRange(start, stop)
   234  
   235      else:
   236        offsets_per_split = max(1, (elems // self._initial_splitting_num_bundles))
   237        for split in restriction.split(offsets_per_split, offsets_per_split // 2):
   238          yield split
   239  
   240    def restriction_size(self, element, restriction):
   241      if self._size_estimate_override is not None:
   242        return self._size_estimate_override
   243      element_size = len(element) if isinstance(element, str) else 1
   244      return restriction.size() * element_size
   245  
   246  
   247  def get_synthetic_sdf_step(
   248      per_element_delay_sec=0,
   249      per_bundle_delay_sec=0,
   250      output_records_per_input_record=1,
   251      output_filter_ratio=0,
   252      initial_splitting_num_bundles=8,
   253      initial_splitting_uneven_chunks=False,
   254      disable_liquid_sharding=False,
   255      size_estimate_override=None,
   256  ):
   257    """A function which returns a SyntheticSDFStep with given parameters. """
   258    class SyntheticSDFStep(beam.DoFn):
   259      """A SplittableDoFn of which behavior can be controlled through prespecified
   260         parameters.
   261      """
   262      def __init__(
   263          self,
   264          per_element_delay_sec_arg,
   265          per_bundle_delay_sec_arg,
   266          output_filter_ratio_arg,
   267          output_records_per_input_record_arg):
   268        if per_element_delay_sec_arg:
   269          per_element_delay_sec_arg = (
   270              per_element_delay_sec_arg // output_records_per_input_record_arg)
   271          if per_element_delay_sec_arg < 1e-3:
   272            raise ValueError(
   273                'Per element sleep time must be at least 1e-3 after being '
   274                'divided among output elements.')
   275        self._per_element_delay_sec = per_element_delay_sec_arg
   276        self._per_bundle_delay_sec = per_bundle_delay_sec_arg
   277        self._output_filter_ratio = output_filter_ratio_arg
   278  
   279      def start_bundle(self):
   280        self._start_time = time.time()
   281  
   282      def finish_bundle(self):
   283        # The target is for the enclosing stage to take as close to as possible
   284        # the given number of seconds, so we only sleep enough to make up for
   285        # overheads not incurred elsewhere.
   286        to_sleep = self._per_bundle_delay_sec - (time.time() - self._start_time)
   287  
   288        # Ignoring sub-millisecond sleep times.
   289        if to_sleep >= 1e-3:
   290          time.sleep(to_sleep)
   291  
   292      def process(
   293          self,
   294          element,
   295          restriction_tracker=beam.DoFn.RestrictionParam(
   296              SyntheticSDFStepRestrictionProvider(
   297                  output_records_per_input_record,
   298                  initial_splitting_num_bundles,
   299                  initial_splitting_uneven_chunks,
   300                  disable_liquid_sharding,
   301                  size_estimate_override))):
   302        filter_element = False
   303        if self._output_filter_ratio > 0:
   304          if np.random.random() < self._output_filter_ratio:
   305            filter_element = True
   306  
   307        current_restriction = restriction_tracker.current_restriction()
   308        for cur in range(current_restriction.start, current_restriction.stop):
   309          if not restriction_tracker.try_claim(cur):
   310            return
   311  
   312          if self._per_element_delay_sec:
   313            time.sleep(self._per_element_delay_sec)
   314  
   315          if not filter_element:
   316            yield element
   317          cur += 1
   318  
   319    return SyntheticSDFStep(
   320        per_element_delay_sec,
   321        per_bundle_delay_sec,
   322        output_filter_ratio,
   323        output_records_per_input_record)
   324  
   325  
   326  class SyntheticSource(iobase.BoundedSource):
   327    """A custom source of a specified size.
   328    """
   329    def __init__(self, input_spec):
   330      """Initiates a synthetic source.
   331  
   332      Args:
   333        input_spec: Input specification of the source. See corresponding option in
   334                    function 'parse_args()' below for more details.
   335      Raises:
   336        ValueError: if input parameters are invalid.
   337      """
   338      def maybe_parse_byte_size(s):
   339        return parse_byte_size(s) if isinstance(s, str) else int(s)
   340  
   341      self._num_records = input_spec['numRecords']
   342      self._key_size = maybe_parse_byte_size(input_spec.get('keySizeBytes', 1))
   343      self._hot_key_fraction = input_spec.get('hotKeyFraction', 0)
   344      self._num_hot_keys = input_spec.get('numHotKeys', 0)
   345  
   346      self._value_size = maybe_parse_byte_size(
   347          input_spec.get('valueSizeBytes', 1))
   348      self._total_size = self.element_size * self._num_records
   349      self._initial_splitting = (
   350          input_spec['bundleSizeDistribution']['type']
   351          if 'bundleSizeDistribution' in input_spec else 'const')
   352      if self._initial_splitting != 'const' and self._initial_splitting != 'zipf':
   353        raise ValueError(
   354            'Only const and zipf distributions are supported for determining '
   355            'sizes of bundles produced by initial splitting. Received: %s',
   356            self._initial_splitting)
   357      self._initial_splitting_num_bundles = (
   358          input_spec['forceNumInitialBundles']
   359          if 'forceNumInitialBundles' in input_spec else 0)
   360      if self._initial_splitting == 'zipf':
   361        self._initial_splitting_distribution_parameter = (
   362            input_spec['bundleSizeDistribution']['param'])
   363        if self._initial_splitting_distribution_parameter < 1:
   364          raise ValueError(
   365              'Parameter for a Zipf distribution must be larger than 1. '
   366              'Received %r.',
   367              self._initial_splitting_distribution_parameter)
   368      else:
   369        self._initial_splitting_distribution_parameter = 0
   370      self._dynamic_splitting = (
   371          'none' if (
   372              'splitPointFrequencyRecords' in input_spec and
   373              input_spec['splitPointFrequencyRecords'] == 0) else 'perfect')
   374      if 'delayDistribution' in input_spec:
   375        if input_spec['delayDistribution']['type'] != 'const':
   376          raise ValueError(
   377              'SyntheticSource currently only supports delay '
   378              'distributions of type \'const\'. Received %s.',
   379              input_spec['delayDistribution']['type'])
   380        self._sleep_per_input_record_sec = (
   381            float(input_spec['delayDistribution']['const']) / 1000)
   382        if (self._sleep_per_input_record_sec and
   383            self._sleep_per_input_record_sec < 1e-3):
   384          raise ValueError(
   385              'Sleep time per input record must be at least 1e-3.'
   386              ' Received: %r',
   387              self._sleep_per_input_record_sec)
   388      else:
   389        self._sleep_per_input_record_sec = 0
   390  
   391    @property
   392    def element_size(self):
   393      return self._key_size + self._value_size
   394  
   395    def estimate_size(self):
   396      return self._total_size
   397  
   398    def split(self, desired_bundle_size, start_position=0, stop_position=None):
   399      # Performs initial splitting of SyntheticSource.
   400      #
   401      # Exact sizes and distribution of initial splits generated here depends on
   402      # the input specification of the SyntheticSource.
   403  
   404      if stop_position is None:
   405        stop_position = self._num_records
   406      if self._initial_splitting == 'zipf':
   407        desired_num_bundles = self._initial_splitting_num_bundles or math.ceil(
   408            float(self.estimate_size()) / desired_bundle_size)
   409        bundle_ranges = initial_splitting_zipf(
   410            start_position,
   411            stop_position,
   412            desired_num_bundles,
   413            self._initial_splitting_distribution_parameter,
   414            self._num_records)
   415      else:
   416        if self._initial_splitting_num_bundles:
   417          bundle_size_in_elements = max(
   418              1, int(self._num_records / self._initial_splitting_num_bundles))
   419        else:
   420          bundle_size_in_elements = (
   421              max(
   422                  div_round_up(desired_bundle_size, self.element_size),
   423                  int(math.floor(math.sqrt(self._num_records)))))
   424        bundle_ranges = []
   425        for start in range(start_position, stop_position,
   426                           bundle_size_in_elements):
   427          stop = min(start + bundle_size_in_elements, stop_position)
   428          bundle_ranges.append((start, stop))
   429  
   430      for start, stop in bundle_ranges:
   431        yield iobase.SourceBundle(stop - start, self, start, stop)
   432  
   433    def get_range_tracker(self, start_position, stop_position):
   434      if start_position is None:
   435        start_position = 0
   436      if stop_position is None:
   437        stop_position = self._num_records
   438      tracker = range_trackers.OffsetRangeTracker(start_position, stop_position)
   439      if self._dynamic_splitting == 'none':
   440        tracker = range_trackers.UnsplittableRangeTracker(tracker)
   441      return tracker
   442  
   443    def _gen_kv_pair(self, generator, index):
   444      generator.seed(index)
   445      rand = generator.random_sample()
   446  
   447      # Determines whether to generate hot key or not.
   448      if rand < self._hot_key_fraction:
   449        # Generate hot key.
   450        # An integer is randomly selected from the range [0, numHotKeys-1]
   451        # with equal probability.
   452        generator_hot = Generator(index % self._num_hot_keys)
   453        bytes_ = generator_hot.bytes(self._key_size), generator.bytes(
   454          self._value_size)
   455      else:
   456        bytes_ = generator.bytes(self.element_size)
   457        bytes_ = bytes_[:self._key_size], bytes_[self._key_size:]
   458      return bytes_
   459  
   460    def read(self, range_tracker):
   461      index = range_tracker.start_position()
   462      generator = Generator()
   463      while range_tracker.try_claim(index):
   464        time.sleep(self._sleep_per_input_record_sec)
   465        yield self._gen_kv_pair(generator, index)
   466        index += 1
   467  
   468    def default_output_coder(self):
   469      return beam.coders.TupleCoder(
   470          [beam.coders.BytesCoder(), beam.coders.BytesCoder()])
   471  
   472  
   473  class SyntheticSDFSourceRestrictionProvider(RestrictionProvider):
   474    """A `RestrictionProvider` for SyntheticSDFAsSource.
   475  
   476    In initial_restriction(element) and split(element), element means source
   477    description.
   478    A typical element is like:
   479  
   480      {
   481        'key_size': 1,
   482        'value_size': 1,
   483        'initial_splitting_num_bundles': 8,
   484        'initial_splitting_desired_bundle_size': 2,
   485        'sleep_per_input_record_sec': 0,
   486        'initial_splitting' : 'const'
   487  
   488      }
   489  
   490    """
   491    def initial_restriction(self, element):
   492      return OffsetRange(0, element['num_records'])
   493  
   494    def create_tracker(self, restriction):
   495      return restriction_trackers.OffsetRestrictionTracker(restriction)
   496  
   497    def split(self, element, restriction):
   498      bundle_ranges = []
   499      start_position = restriction.start
   500      stop_position = restriction.stop
   501      element_size = element['key_size'] + element['value_size']
   502      estimate_size = element_size * element['num_records']
   503      if element['initial_splitting'] == 'zipf':
   504        desired_num_bundles = (
   505            element['initial_splitting_num_bundles'] or div_round_up(
   506                estimate_size, element['initial_splitting_desired_bundle_size']))
   507        samples = np.random.zipf(
   508            element['initial_splitting_distribution_parameter'],
   509            desired_num_bundles)
   510        total = sum(samples)
   511        relative_bundle_sizes = [(float(sample) / total) for sample in samples]
   512        start = start_position
   513        index = 0
   514        while start < stop_position:
   515          if index == desired_num_bundles - 1:
   516            bundle_ranges.append(OffsetRange(start, stop_position))
   517            break
   518          stop = start + int(
   519              element['num_records'] * relative_bundle_sizes[index])
   520          bundle_ranges.append(OffsetRange(start, stop))
   521          start = stop
   522          index += 1
   523      else:
   524        if element['initial_splitting_num_bundles']:
   525          bundle_size_in_elements = max(
   526              1,
   527              int(
   528                  element['num_records'] /
   529                  element['initial_splitting_num_bundles']))
   530        else:
   531          bundle_size_in_elements = (
   532              max(
   533                  div_round_up(
   534                      element['initial_splitting_desired_bundle_size'],
   535                      element_size),
   536                  int(math.floor(math.sqrt(element['num_records'])))))
   537        for start in range(start_position, stop_position,
   538                           bundle_size_in_elements):
   539          stop = min(start + bundle_size_in_elements, stop_position)
   540          bundle_ranges.append(OffsetRange(start, stop))
   541      return bundle_ranges
   542  
   543    def restriction_size(self, element, restriction):
   544      return (element['key_size'] + element['value_size']) * restriction.size()
   545  
   546  
   547  class SyntheticSDFAsSource(beam.DoFn):
   548    """A SDF that generates records like a source.
   549  
   550    This SDF accepts a PCollection of record-based source description.
   551    A typical description is like:
   552  
   553      {
   554        'key_size': 1,
   555        'value_size': 1,
   556        'initial_splitting_num_bundles': 8,
   557        'initial_splitting_desired_bundle_size': 2,
   558        'sleep_per_input_record_sec': 0,
   559        'initial_splitting' : 'const'
   560  
   561      }
   562  
   563    A simple pipeline taking this SDF as a source is like:
   564      p
   565      | beam.Create([description1, description2,...])
   566      | beam.ParDo(SyntheticSDFAsSource())
   567  
   568    NOTE:
   569      The SDF.process() will have different param content between defining a DoFn
   570      and runtime.
   571      When defining an SDF.process, the restriction_tracker should be a
   572      `RestrictionProvider`.
   573      During runtime, the DoFnRunner.process_with_sized_restriction() will feed
   574      a 'RestrictionTracker' based on a restriction to SDF.process().
   575    """
   576    def process(
   577        self,
   578        element,
   579        restriction_tracker=beam.DoFn.RestrictionParam(
   580            SyntheticSDFSourceRestrictionProvider())):
   581      cur = restriction_tracker.current_restriction().start
   582      while restriction_tracker.try_claim(cur):
   583        r = Generator()
   584        r.seed(cur)
   585        time.sleep(element['sleep_per_input_record_sec'])
   586        yield r.bytes(element['key_size']), r.bytes(element['value_size'])
   587        cur += 1
   588  
   589  
   590  class ShuffleBarrier(beam.PTransform):
   591    def expand(self, pc):
   592      return (
   593          pc
   594          | beam.Map(rotate_key)
   595          | beam.GroupByKey()
   596          | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]]))
   597  
   598  
   599  class SideInputBarrier(beam.PTransform):
   600    def expand(self, pc):
   601      return (
   602          pc
   603          | beam.Map(rotate_key)
   604          | beam.Map(
   605              lambda elem,
   606              ignored: elem,
   607              beam.pvalue.AsIter(pc | beam.FlatMap(lambda elem: None))))
   608  
   609  
   610  def merge_using_gbk(name, pc1, pc2):
   611    """Merges two given PCollections using a CoGroupByKey."""
   612  
   613    pc1_with_key = pc1 | (name + 'AttachKey1') >> beam.Map(lambda x: (x, x))
   614    pc2_with_key = pc2 | (name + 'AttachKey2') >> beam.Map(lambda x: (x, x))
   615  
   616    grouped = ({
   617        'pc1': pc1_with_key, 'pc2': pc2_with_key
   618    } | (name + 'Group') >> beam.CoGroupByKey())
   619    return (
   620        grouped | (name + 'DeDup') >> beam.Map(lambda elm: elm[0])
   621    )  # Ignoring values
   622  
   623  
   624  def merge_using_side_input(name, pc1, pc2):
   625    """Merges two given PCollections using side inputs."""
   626    def join_fn(val, _):  # Ignoring side input
   627      return val
   628  
   629    return pc1 | name >> beam.core.Map(join_fn, beam.pvalue.AsIter(pc2))
   630  
   631  
   632  def expand_using_gbk(name, pc):
   633    """Expands a given PCollection into two copies using GroupByKey."""
   634  
   635    ret = []
   636    ret.append((pc | ('%s.a' % name) >> ShuffleBarrier()))
   637    ret.append((pc | ('%s.b' % name) >> ShuffleBarrier()))
   638    return ret
   639  
   640  
   641  def expand_using_second_output(name, pc):
   642    """Expands a given PCollection into two copies using side outputs."""
   643    class ExpandFn(beam.DoFn):
   644      def process(self, element):
   645        yield beam.pvalue.TaggedOutput('second_out', element)
   646        yield element
   647  
   648    pc1, pc2 = (pc | name >> beam.ParDo(
   649        ExpandFn()).with_outputs('second_out', main='main_out'))
   650    return [pc1, pc2]
   651  
   652  
   653  def _parse_steps(json_str):
   654    """Converts the JSON step description into Python objects.
   655  
   656    See property 'steps' for more details about the JSON step description.
   657  
   658    Args:
   659      json_str: a JSON string that describes the steps.
   660  
   661    Returns:
   662      Information about steps as a list of dictionaries. Each dictionary may have
   663      following properties.
   664      (1) per_element_delay - amount of delay for each element in seconds.
   665      (2) per_bundle_delay - minimum amount of delay for a given step in seconds.
   666      (3) output_records_per_input_record - number of output elements generated
   667          for each input element to a step.
   668      (4) output_filter_ratio - the probability at which a step may filter out a
   669          given element by not producing any output for that element.
   670      (5) splittable - if the step should be splittable.
   671      (6) initial_splitting_num_bundles - number of bundles initial split if step
   672          is splittable.
   673      (7) initial_splitting_uneven_chunks - if the bundles should be
   674          unevenly-sized
   675      (8) disable_liquid_sharding - if liquid sharding should be disabled
   676      (9) size_estimate_override - the size estimate or None to use default
   677    """
   678    all_steps = []
   679    json_data = json.loads(json_str)
   680    for val in json_data:
   681      steps = {}
   682      steps['per_element_delay'] = ((float(val['per_element_delay_msec']) / 1000)
   683                                    if 'per_element_delay_msec' in val else 0)
   684      steps['per_bundle_delay'] = (
   685          float(val['per_bundle_delay_sec'])
   686          if 'per_bundle_delay_sec' in val else 0)
   687      steps['output_records_per_input_record'] = (
   688          int(val['output_records_per_input_record'])
   689          if 'output_records_per_input_record' in val else 1)
   690      steps['output_filter_ratio'] = (
   691          float(val['output_filter_ratio'])
   692          if 'output_filter_ratio' in val else 0)
   693      steps['splittable'] = (
   694          bool(val['splittable']) if 'splittable' in val else False)
   695      steps['initial_splitting_num_bundles'] = (
   696          int(val['initial_splitting_num_bundles'])
   697          if 'initial_splitting_num_bundles' in val else 8)
   698      steps['initial_splitting_uneven_chunks'] = (
   699          bool(val['initial_splitting_uneven_chunks'])
   700          if 'initial_splitting_uneven_chunks' in val else False)
   701      steps['disable_liquid_sharding'] = (
   702          bool(val['disable_liquid_sharding'])
   703          if 'disable_liquid_sharding' in val else False)
   704      steps['size_estimate_override'] = (
   705          int(val['size_estimate_override'])
   706          if 'size_estimate_override' in val else None)
   707      all_steps.append(steps)
   708  
   709    return all_steps
   710  
   711  
   712  def parse_args(args):
   713    """Parses a given set of arguments.
   714  
   715    Args:
   716      args: set of arguments to be passed.
   717  
   718    Returns:
   719      a tuple where first item gives the set of arguments defined and parsed
   720      within this method and second item gives the set of unknown arguments.
   721    """
   722  
   723    parser = argparse.ArgumentParser()
   724    parser.add_argument(
   725        '--steps',
   726        dest='steps',
   727        type=_parse_steps,
   728        help='A JSON string that gives a list where each entry of the list is '
   729        'configuration information for a step. Configuration for each step '
   730        'consists of '
   731        '(1) A float "per_bundle_delay_sec" (in seconds). Defaults to 0.'
   732        '(2) A float "per_element_delay_msec" (in milli seconds). '
   733        '    Defaults to 0.'
   734        '(3) An integer "output_records_per_input_record". Defaults to 1.'
   735        '(4) A float "output_filter_ratio" in the range [0, 1] . '
   736        '    Defaults to 0.'
   737        '(5) A bool "splittable" that defaults to false.'
   738        '(6) An integer "initial_splitting_num_bundles". Defaults to 8.')
   739  
   740    parser.add_argument(
   741        '--input',
   742        dest='input',
   743        type=json.loads,
   744        help='A JSON string that describes the properties of the SyntheticSource '
   745        'used by the pipeline. Configuration is similar to Java '
   746        'SyntheticBoundedInput.'
   747        'Currently supports following properties. '
   748        '(1) An integer "numRecords". '
   749        '(2) An integer "keySize". '
   750        '(3) An integer "valueSize". '
   751        '(4) A tuple "bundleSizeDistribution" with following values. '
   752        '    A string "type". Allowed values are "const" and "zipf". '
   753        '    An float "param". Only used if "type"=="zipf". Must be '
   754        '    larger than 1. '
   755        '(5) An integer "forceNumInitialBundles". '
   756        '(6) An integer "splitPointFrequencyRecords". '
   757        '(7) A tuple "delayDistribution" with following values. '
   758        '    A string "type". Only allowed value is "const". '
   759        '    An integer "const". ')
   760  
   761    parser.add_argument(
   762        '--barrier',
   763        dest='barrier',
   764        default='shuffle',
   765        choices=[
   766            'shuffle',
   767            'side-input',
   768            'expand-gbk',
   769            'expand-second-output',
   770            'merge-gbk',
   771            'merge-side-input'
   772        ],
   773        help='Whether to use shuffle as the barrier '
   774        '(as opposed to side inputs).')
   775    parser.add_argument(
   776        '--output',
   777        dest='output',
   778        default='',
   779        help='Destination to write output.')
   780  
   781    return parser.parse_known_args(args)
   782  
   783  
   784  def run(argv=None, save_main_session=True):
   785    """Runs the workflow."""
   786    known_args, pipeline_args = parse_args(argv)
   787  
   788    pipeline_options = PipelineOptions(pipeline_args)
   789    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
   790  
   791    input_info = known_args.input
   792  
   793    with TestPipeline(options=pipeline_options) as p:
   794      source = SyntheticSource(input_info)
   795  
   796      # pylint: disable=expression-not-assigned
   797      barrier = known_args.barrier
   798  
   799      pc_list = []
   800      num_roots = 2**(len(known_args.steps) - 1) if (
   801          barrier == 'merge-gbk' or barrier == 'merge-side-input') else 1
   802      for read_no in range(num_roots):
   803        pc_list.append((p | ('Read %d' % read_no) >> beam.io.Read(source)))
   804  
   805      for step_no, steps in enumerate(known_args.steps):
   806        if step_no != 0:
   807          new_pc_list = []
   808          for pc_no, pc in enumerate(pc_list):
   809            if barrier == 'shuffle':
   810              new_pc_list.append(
   811                  (pc | ('shuffle %d.%d' % (step_no, pc_no)) >> ShuffleBarrier()))
   812            elif barrier == 'side-input':
   813              new_pc_list.append((
   814                  pc | ('side-input %d.%d' %
   815                        (step_no, pc_no)) >> SideInputBarrier()))
   816            elif barrier == 'expand-gbk':
   817              new_pc_list.extend(
   818                  expand_using_gbk(('expand-gbk %d.%d' % (step_no, pc_no)), pc))
   819            elif barrier == 'expand-second-output':
   820              new_pc_list.extend(
   821                  expand_using_second_output(
   822                      ('expand-second-output %d.%d' % (step_no, pc_no)), pc))
   823            elif barrier == 'merge-gbk':
   824              if pc_no % 2 == 0:
   825                new_pc_list.append(
   826                    merge_using_gbk(('merge-gbk %d.%d' % (step_no, pc_no)),
   827                                    pc,
   828                                    pc_list[pc_no + 1]))
   829              else:
   830                continue
   831            elif barrier == 'merge-side-input':
   832              if pc_no % 2 == 0:
   833                new_pc_list.append(
   834                    merge_using_side_input(
   835                        ('merge-side-input %d.%d' % (step_no, pc_no)),
   836                        pc,
   837                        pc_list[pc_no + 1]))
   838              else:
   839                continue
   840  
   841          pc_list = new_pc_list
   842  
   843        new_pc_list = []
   844        for pc_no, pc in enumerate(pc_list):
   845          if steps['splittable']:
   846            step = get_synthetic_sdf_step(
   847                per_element_delay_sec=steps['per_element_delay'],
   848                per_bundle_delay_sec=steps['per_bundle_delay'],
   849                output_records_per_input_record=steps[
   850                    'output_records_per_input_record'],
   851                output_filter_ratio=steps['output_filter_ratio'],
   852                initial_splitting_num_bundles=steps[
   853                    'initial_splitting_num_bundles'],
   854                initial_splitting_uneven_chunks=steps[
   855                    'initial_splitting_uneven_chunks'],
   856                disable_liquid_sharding=steps['disable_liquid_sharding'],
   857                size_estimate_override=steps['size_estimate_override'])
   858          else:
   859            step = SyntheticStep(
   860                per_element_delay_sec=steps['per_element_delay'],
   861                per_bundle_delay_sec=steps['per_bundle_delay'],
   862                output_records_per_input_record=steps[
   863                    'output_records_per_input_record'],
   864                output_filter_ratio=steps['output_filter_ratio'])
   865          new_pc = pc | 'SyntheticStep %d.%d' % (step_no,
   866                                                 pc_no) >> beam.ParDo(step)
   867          new_pc_list.append(new_pc)
   868        pc_list = new_pc_list
   869  
   870      if known_args.output:
   871        # If an output location is provided we format and write output.
   872        if len(pc_list) == 1:
   873          (
   874              pc_list[0]
   875              | 'FormatOutput' >> beam.Map(lambda elm: (elm[0] + elm[1]))
   876              | 'WriteOutput' >> WriteToText(known_args.output))
   877  
   878    logging.info('Pipeline run completed.')
   879  
   880  
   881  if __name__ == '__main__':
   882    logging.getLogger().setLevel(logging.INFO)
   883    run()
   884  
   885  
   886  class StatefulLoadGenerator(beam.PTransform):
   887    """A PTransform for generating random data using Timers API."""
   888    def __init__(self, input_options, num_keys=100):
   889      self.num_records = input_options['num_records']
   890      self.key_size = input_options['key_size']
   891      self.value_size = input_options['value_size']
   892      self.num_keys = num_keys
   893  
   894    @typehints.with_output_types(Tuple[bytes, bytes])
   895    class GenerateKeys(beam.DoFn):
   896      def __init__(self, num_keys, key_size):
   897        self.num_keys = num_keys
   898        self.key_size = key_size
   899  
   900      def process(self, impulse):
   901        for _ in range(self.num_keys):
   902          key = os.urandom(self.key_size)
   903          yield key, b''
   904  
   905    class GenerateLoad(beam.DoFn):
   906      state_spec = userstate.CombiningValueStateSpec(
   907          'bundles_remaining', combine_fn=sum)
   908      timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
   909  
   910      def __init__(self, num_records_per_key, value_size, bundle_size=1000):
   911        self.num_records_per_key = num_records_per_key
   912        self.payload = os.urandom(value_size)
   913        self.bundle_size = bundle_size
   914        self.time_fn = time.time
   915  
   916      def process(
   917          self,
   918          _element,
   919          records_remaining=beam.DoFn.StateParam(state_spec),
   920          timer=beam.DoFn.TimerParam(timer_spec)):
   921        records_remaining.add(self.num_records_per_key)
   922        timer.set(0)
   923  
   924      @userstate.on_timer(timer_spec)
   925      def process_timer(
   926          self,
   927          key=beam.DoFn.KeyParam,
   928          records_remaining=beam.DoFn.StateParam(state_spec),
   929          timer=beam.DoFn.TimerParam(timer_spec)):
   930        cur_bundle_size = min(self.bundle_size, records_remaining.read())
   931        for _ in range(cur_bundle_size):
   932          records_remaining.add(-1)
   933          yield key, self.payload
   934        if records_remaining.read() > 0:
   935          timer.set(0)
   936  
   937    def expand(self, pbegin):
   938      assert isinstance(pbegin, pvalue.PBegin), (
   939          'Input to transform must be a PBegin but found %s' % pbegin)
   940      return (
   941          pbegin
   942          | 'Impulse' >> beam.Impulse()
   943          | 'GenerateKeys' >> beam.ParDo(
   944              StatefulLoadGenerator.GenerateKeys(self.num_keys, self.key_size))
   945          | 'GenerateLoad' >> beam.ParDo(
   946              StatefulLoadGenerator.GenerateLoad(
   947                  self.num_records // self.num_keys, self.value_size)))