github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigquery_side_input.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigquery_side_input.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A Dataflow job that uses BigQuery sources as side inputs.
    19  
    20  Illustrates how to insert side-inputs into transforms in three different forms,
    21  as a singleton, as an iterator, and as a list.
    22  
    23  This workflow generates a set of tuples of the form (groupId, corpus, word) where
    24  groupId is a generated identifier for the group and corpus and word are randomly
    25  selected from corresponding rows in BQ dataset 'publicdata:samples.shakespeare'.
    26  Users should specify the number of groups to form and optionally a corpus and/or
    27  a word that should be ignored when forming groups.
    28  """
    29  
    30  # pytype: skip-file
    31  
    32  import argparse
    33  import logging
    34  from random import randrange
    35  
    36  import apache_beam as beam
    37  from apache_beam.io import WriteToText
    38  from apache_beam.options.pipeline_options import PipelineOptions
    39  from apache_beam.options.pipeline_options import SetupOptions
    40  from apache_beam.pvalue import AsList
    41  from apache_beam.pvalue import AsSingleton
    42  
    43  
    44  def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    45    """Generate groups given the input PCollections."""
    46    def attach_corpus_fn(group, corpus, ignore):
    47      selected = None
    48      len_corpus = len(corpus)
    49      while not selected:
    50        c = list(corpus[randrange(0, len_corpus)].values())[0]
    51        if c != ignore:
    52          selected = c
    53  
    54      yield (group, selected)
    55  
    56    def attach_word_fn(group, words, ignore):
    57      selected = None
    58      len_words = len(words)
    59      while not selected:
    60        c = list(words[randrange(0, len_words)].values())[0]
    61        if c != ignore:
    62          selected = c
    63  
    64      yield group + (selected, )
    65  
    66    return (
    67        group_ids
    68        | 'attach corpus' >> beam.FlatMap(
    69            attach_corpus_fn, AsList(corpus), AsSingleton(ignore_corpus))
    70        | 'attach word' >> beam.FlatMap(
    71            attach_word_fn, AsList(word), AsSingleton(ignore_word)))
    72  
    73  
    74  def run(argv=None):
    75    """Run the workflow."""
    76  
    77    parser = argparse.ArgumentParser()
    78    parser.add_argument('--output')
    79    parser.add_argument('--ignore_corpus', default='')
    80    parser.add_argument('--ignore_word', default='')
    81    parser.add_argument('--num_groups')
    82  
    83    known_args, pipeline_args = parser.parse_known_args(argv)
    84    # We use the save_main_session option because one or more DoFn's in this
    85    # workflow rely on global context (e.g., a module imported at module level).
    86    pipeline_options = PipelineOptions(pipeline_args)
    87    pipeline_options.view_as(SetupOptions).save_main_session = True
    88    with beam.Pipeline(options=pipeline_options) as p:
    89  
    90      group_ids = []
    91      for i in range(0, int(known_args.num_groups)):
    92        group_ids.append('id' + str(i))
    93  
    94      query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
    95      query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
    96      ignore_corpus = known_args.ignore_corpus
    97      ignore_word = known_args.ignore_word
    98  
    99      pcoll_corpus = p | 'read corpus' >> beam.io.ReadFromBigQuery(
   100          query=query_corpus)
   101      pcoll_word = p | 'read_words' >> beam.io.ReadFromBigQuery(query=query_word)
   102      pcoll_ignore_corpus = p | 'create_ignore_corpus' >> beam.Create(
   103          [ignore_corpus])
   104      pcoll_ignore_word = p | 'create_ignore_word' >> beam.Create([ignore_word])
   105      pcoll_group_ids = p | 'create groups' >> beam.Create(group_ids)
   106  
   107      pcoll_groups = create_groups(
   108          pcoll_group_ids,
   109          pcoll_corpus,
   110          pcoll_word,
   111          pcoll_ignore_corpus,
   112          pcoll_ignore_word)
   113  
   114      # pylint:disable=expression-not-assigned
   115      pcoll_groups | WriteToText(known_args.output)
   116  
   117  
   118  if __name__ == '__main__':
   119    logging.getLogger().setLevel(logging.INFO)
   120    run()