github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/operation_specs.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Worker utilities for representing MapTasks.

Each MapTask represents a sequence of ParallelInstruction(s): read from a
source, write to a sink, parallel do, etc.
"""

# pytype: skip-file

import collections

from apache_beam import coders


def build_worker_instruction(*args):
  """Create an object representing a ParallelInstruction protobuf.

  This will be a collections.namedtuple with a custom __str__ method.

  Alas, this wrapper is not known to pylint, which thinks it creates
  constants.  You may have to put a disable=invalid-name pylint
  annotation on any use of this, depending on your names.

  Args:
    *args: The first argument is the name of the type to create and should
      start with "Worker".  The second argument is a list of the names of
      the attributes of this object.

  Returns:
    A new class, a subclass of tuple, that represents the protobuf.
  """
  tuple_class = collections.namedtuple(*args)
  tuple_class.__str__ = worker_object_to_string
  tuple_class.__repr__ = worker_object_to_string
  return tuple_class
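
# A minimal usage sketch (illustrative only; 'WorkerExample' and its fields
# are hypothetical, not part of this module):
#
#   WorkerExample = build_worker_instruction(
#       'WorkerExample', ['source', 'tag'])
#   instr = WorkerExample(source='some_source', tag='side0')
#   str(instr)  # -> "WorkerExample(source=some_source, tag=side0)"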


def worker_printable_fields(workerproto):
  """Returns the interesting fields of a Worker* object."""
  return [
      '%s=%s' % (name, value)
      # _asdict is the only way to iterate the fields, and we cannot
      # subclass this generated class.
      # pylint: disable=protected-access
      for name, value in workerproto._asdict().items()
      # We want to output a value of 0, but not None nor [].
      if (value or value == 0) and name not in (
          'coder',
          'coders',
          'output_coders',
          'elements',
          'combine_fn',
          'serialized_fn',
          'window_fn',
          'append_trailing_newlines',
          'strip_trailing_newlines',
          'compression_type',
          'context',
          'start_shuffle_position',
          'end_shuffle_position',
          'shuffle_reader_config',
          'shuffle_writer_config')
  ]


def worker_object_to_string(worker_object):
  """Returns a string compactly representing a Worker* object."""
  return '%s(%s)' % (
      worker_object.__class__.__name__,
      ', '.join(worker_printable_fields(worker_object)))
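
# A sketch of the filtering behavior above (hypothetical 'WorkerDemo' type):
# a value of 0 is printed, None is dropped, and noisy fields such as 'coder'
# are always suppressed:
#
#   WorkerDemo = build_worker_instruction(
#       'WorkerDemo', ['idx', 'tag', 'coder'])
#   str(WorkerDemo(idx=0, tag=None, coder='...'))  # -> "WorkerDemo(idx=0)"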


# All the following Worker* definitions will have these lint problems:
# pylint: disable=invalid-name
# pylint: disable=pointless-string-statement

WorkerRead = build_worker_instruction('WorkerRead', ['source', 'output_coders'])
"""Worker details needed to read from a source.

Attributes:
  source: a source object.
  output_coders: 1-tuple of the coder for the output.
"""

WorkerSideInputSource = build_worker_instruction(
    'WorkerSideInputSource', ['source', 'tag'])
"""Worker details needed to read from a side input source.

Attributes:
  source: a source object.
  tag: string tag for this side input.
"""

WorkerGroupingShuffleRead = build_worker_instruction(
    'WorkerGroupingShuffleRead',
    [
        'start_shuffle_position',
        'end_shuffle_position',
        'shuffle_reader_config',
        'coder',
        'output_coders'
    ])
"""Worker details needed to read from a grouping shuffle source.

Attributes:
  start_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to start reading.
  end_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to stop reading.
  shuffle_reader_config: An opaque string used to initialize the shuffle
    reader. Contains things like connection endpoints for the shuffle
    server appliance and various options.
  coder: The KV coder used to decode shuffle entries.
  output_coders: 1-tuple of the coder for the output.
"""

WorkerUngroupedShuffleRead = build_worker_instruction(
    'WorkerUngroupedShuffleRead',
    [
        'start_shuffle_position',
        'end_shuffle_position',
        'shuffle_reader_config',
        'coder',
        'output_coders'
    ])
"""Worker details needed to read from an ungrouped shuffle source.

Attributes:
  start_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to start reading.
  end_shuffle_position: An opaque string to be passed to the shuffle
    source to indicate where to stop reading.
  shuffle_reader_config: An opaque string used to initialize the shuffle
    reader. Contains things like connection endpoints for the shuffle
    server appliance and various options.
  coder: The value coder used to decode shuffle entries.
  output_coders: 1-tuple of the coder for the output.
"""

WorkerWrite = build_worker_instruction(
    'WorkerWrite', ['sink', 'input', 'output_coders'])
"""Worker details needed to write to a sink.

Attributes:
  sink: a sink object.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple, coder to use to estimate bytes written.
"""
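
# Illustrative wiring sketch (my_source, my_sink and some_coder are
# hypothetical): within a MapTask the instructions form a list, and 'input'
# refers back to a producer by its position in that list. Here instruction 1
# consumes output 0 of instruction 0:
#
#   map_task = [
#       WorkerRead(source=my_source, output_coders=(some_coder, )),
#       WorkerWrite(
#           sink=my_sink, input=(0, 0), output_coders=(some_coder, )),
#   ]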

WorkerInMemoryWrite = build_worker_instruction(
    'WorkerInMemoryWrite',
    ['output_buffer', 'write_windowed_values', 'input', 'output_coders'])
"""Worker details needed to write to an in-memory sink.

Used only for unit testing. It makes worker tests less cluttered with code like
"write to a file and then check file contents".

Attributes:
  output_buffer: list to which output elements will be appended
  write_windowed_values: whether to record the entire WindowedValue outputs,
    or just the raw (unwindowed) value
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple, coder to use to estimate bytes written.
"""

WorkerShuffleWrite = build_worker_instruction(
    'WorkerShuffleWrite',
    ['shuffle_kind', 'shuffle_writer_config', 'input', 'output_coders'])
"""Worker details needed to write to a shuffle sink.

Attributes:
  shuffle_kind: A string describing the shuffle kind. This can control the
    way the worker interacts with the shuffle sink. The possible values are:
    'ungrouped', 'group_keys', and 'group_keys_and_sort_values'.
  shuffle_writer_config: An opaque string used to initialize the shuffle
    writer. Contains things like connection endpoints for the shuffle
    server appliance and various options.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple of the coder for input elements. If the
    shuffle_kind is grouping, this is expected to be a KV coder.
"""

WorkerDoFn = build_worker_instruction(
    'WorkerDoFn',
    ['serialized_fn', 'output_tags', 'input', 'side_inputs', 'output_coders'])
"""Worker details needed to run a DoFn.

Attributes:
  serialized_fn: A serialized DoFn object to be run for each input element.
  output_tags: The string tags used to identify the outputs of a ParDo
    operation. The tag is present even if the ParDo has just one output
    (e.g., ['out']).
  output_coders: array of coders, one for each output.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  side_inputs: A list of Worker...Read instances describing sources to be
    used for getting values. The types supported right now are
    WorkerInMemoryRead and WorkerTextRead.
"""
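
# Illustrative sketch (values are hypothetical): a ParDo with a main output
# and one tagged side output carries one tag and one coder per output, and
# here consumes output 0 of instruction 0:
#
#   do_instr = WorkerDoFn(
#       serialized_fn=pickled_do_fn,
#       output_tags=['out', 'out_errors'],
#       input=(0, 0),
#       side_inputs=[],
#       output_coders=[main_coder, error_coder])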

WorkerReifyTimestampAndWindows = build_worker_instruction(
    'WorkerReifyTimestampAndWindows', ['output_tags', 'input', 'output_coders'])
"""Worker details needed to run a WindowInto.

Attributes:
  output_tags: The string tags used to identify the outputs of a ParDo
    operation. The tag is present even if the ParDo has just one output
    (e.g., ['out']).
  output_coders: array of coders, one for each output.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
"""

WorkerMergeWindows = build_worker_instruction(
    'WorkerMergeWindows',
    [
        'window_fn',
        'combine_fn',
        'phase',
        'output_tags',
        'input',
        'coders',
        'context',
        'output_coders'
    ])
"""Worker details needed to run a MergeWindows (a.k.a. GroupAlsoByWindows).

Attributes:
  window_fn: A serialized Windowing object representing the windowing strategy.
  combine_fn: A serialized CombineFn object to be used after executing the
    GroupAlsoByWindows operation. May be None if not a combining operation.
  phase: Possible values are 'all', 'add', 'merge', and 'extract'.
    A runner optimizer may split the user combiner into 3 separate
    phases (ADD, MERGE, and EXTRACT), on separate VMs, as it sees
    fit. The phase attribute dictates which DoFn is actually running in
    the worker. May be None if not a combining operation.
  output_tags: The string tags used to identify the outputs of a ParDo
    operation. The tag is present even if the ParDo has just one output
    (e.g., ['out']).
  output_coders: array of coders, one for each output.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  coders: A 2-tuple of coders (key, value) to encode shuffle entries.
  context: The ExecutionContext object for the current work item.
"""

WorkerCombineFn = build_worker_instruction(
    'WorkerCombineFn', ['serialized_fn', 'phase', 'input', 'output_coders'])
"""Worker details needed to run a CombineFn.

Attributes:
  serialized_fn: A serialized CombineFn object to be used.
  phase: Possible values are 'all', 'add', 'merge', and 'extract'.
    A runner optimizer may split the user combiner into 3 separate
    phases (ADD, MERGE, and EXTRACT), on separate VMs, as it sees
    fit. The phase attribute dictates which DoFn is actually running in
    the worker.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple of the coder for the output.
"""

WorkerPartialGroupByKey = build_worker_instruction(
    'WorkerPartialGroupByKey', ['combine_fn', 'input', 'output_coders'])
"""Worker details needed to run a partial group-by-key.

Attributes:
  combine_fn: A serialized CombineFn object to be used.
  input: A (producer index, output index) tuple representing the
    ParallelInstruction operation whose output feeds into this operation.
    The output index is 0 except for multi-output operations (like ParDo).
  output_coders: 1-tuple of the coder for the output.
"""

WorkerFlatten = build_worker_instruction(
    'WorkerFlatten', ['inputs', 'output_coders'])
"""Worker details needed to run a Flatten.

Attributes:
  inputs: A list of tuples, each (producer index, output index), representing
    the ParallelInstruction operations whose output feeds into this operation.
    The output index is 0 unless the input is from a multi-output
    operation (such as ParDo).
  output_coders: 1-tuple of the coder for the output.
"""
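
# Illustrative sketch (some_coder is hypothetical): a Flatten merging the
# outputs of instructions 0 and 1, both single-output, so each output index
# is 0:
#
#   flatten_instr = WorkerFlatten(
#       inputs=[(0, 0), (1, 0)], output_coders=(some_coder, ))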


def get_coder_from_spec(coder_spec):
  """Return a coder instance from a coder spec.

  Args:
    coder_spec: A dict where the value of the '@type' key is either a
      well-known 'kind:...' coder name or a serialized (pickled) Coder
      instance.

  Returns:
    A coder instance (has encode/decode methods).
  """
  assert coder_spec is not None

  # Ignore the wrappers in these encodings.
  ignored_wrappers = (
      'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder')
  if coder_spec['@type'] in ignored_wrappers:
    assert len(coder_spec['component_encodings']) == 1
    coder_spec = coder_spec['component_encodings'][0]
    return get_coder_from_spec(coder_spec)

  # Handle a few well known types of coders.
  if coder_spec['@type'] == 'kind:pair':
    assert len(coder_spec['component_encodings']) == 2
    component_coders = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']
    ]
    return coders.TupleCoder(component_coders)
  elif coder_spec['@type'] == 'kind:stream':
    assert len(coder_spec['component_encodings']) == 1
    return coders.IterableCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:windowed_value':
    assert len(coder_spec['component_encodings']) == 2
    value_coder, window_coder = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.coders.WindowedValueCoder(
        value_coder, window_coder=window_coder)
  elif coder_spec['@type'] == 'kind:interval_window':
    assert (
        'component_encodings' not in coder_spec or
        not coder_spec['component_encodings'])
    return coders.coders.IntervalWindowCoder()
  elif coder_spec['@type'] == 'kind:global_window':
    assert (
        'component_encodings' not in coder_spec or
        not coder_spec['component_encodings'])
    return coders.coders.GlobalWindowCoder()
  elif coder_spec['@type'] == 'kind:varint':
    assert (
        'component_encodings' not in coder_spec or
        len(coder_spec['component_encodings']) == 0)
    return coders.coders.VarIntCoder()
  elif coder_spec['@type'] == 'kind:length_prefix':
    assert len(coder_spec['component_encodings']) == 1
    return coders.coders.LengthPrefixCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:bytes':
    assert (
        'component_encodings' not in coder_spec or
        len(coder_spec['component_encodings']) == 0)
    return coders.BytesCoder()

  # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
  # description JSON more readable.
  return coders.coders.deserialize_coder(coder_spec['@type'].encode('ascii'))
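
# A minimal illustrative spec exercising the well-known kinds above: a
# windowed value whose payload is a varint in the global window.
#
#   spec = {
#       '@type': 'kind:windowed_value',
#       'component_encodings': [
#           {'@type': 'kind:varint'},
#           {'@type': 'kind:global_window'},
#       ],
#   }
#   coder = get_coder_from_spec(spec)
#   # coder is a WindowedValueCoder wrapping a VarIntCoder, windowed by
#   # a GlobalWindowCoder.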