github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigquery_side_input.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A Dataflow job that uses BigQuery sources as side inputs.

Illustrates how to pass side inputs to transforms in three different forms:
as a singleton, as an iterator, and as a list.

This workflow generates a set of tuples of the form (groupId, corpus, word),
where groupId is a generated identifier for the group, and corpus and word are
randomly selected from corresponding rows in the BQ dataset
'publicdata:samples.shakespeare'. Users should specify the number of groups to
form and optionally a corpus and/or a word that should be ignored when forming
groups.
"""

# pytype: skip-file

import argparse
import logging
from random import randrange

import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.pvalue import AsList
from apache_beam.pvalue import AsSingleton


def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
  """Generate groups given the input PCollections."""
  def attach_corpus_fn(group, corpus, ignore):
    # 'corpus' arrives as a materialized list (AsList side input); 'ignore' is
    # a single value (AsSingleton side input). Keep sampling rows until one
    # that is not the ignored corpus is found.
    selected = None
    len_corpus = len(corpus)
    while not selected:
      c = list(corpus[randrange(0, len_corpus)].values())[0]
      if c != ignore:
        selected = c

    yield (group, selected)

  def attach_word_fn(group, words, ignore):
    # Same pattern as above, appending a randomly selected word to the
    # (group, corpus) tuple.
    selected = None
    len_words = len(words)
    while not selected:
      c = list(words[randrange(0, len_words)].values())[0]
      if c != ignore:
        selected = c

    yield group + (selected, )

  return (
      group_ids
      | 'attach corpus' >> beam.FlatMap(
          attach_corpus_fn, AsList(corpus), AsSingleton(ignore_corpus))
      | 'attach word' >> beam.FlatMap(
          attach_word_fn, AsList(word), AsSingleton(ignore_word)))


def run(argv=None):
  """Run the workflow."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--output')
  parser.add_argument('--ignore_corpus', default='')
  parser.add_argument('--ignore_word', default='')
  parser.add_argument('--num_groups')

  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:

    group_ids = []
    for i in range(0, int(known_args.num_groups)):
      group_ids.append('id' + str(i))

    query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
    query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
    ignore_corpus = known_args.ignore_corpus
    ignore_word = known_args.ignore_word

    pcoll_corpus = p | 'read corpus' >> beam.io.ReadFromBigQuery(
        query=query_corpus)
    pcoll_word = p | 'read_words' >> beam.io.ReadFromBigQuery(query=query_word)
    pcoll_ignore_corpus = p | 'create_ignore_corpus' >> beam.Create(
        [ignore_corpus])
    pcoll_ignore_word = p | 'create_ignore_word' >> beam.Create([ignore_word])
    pcoll_group_ids = p | 'create groups' >> beam.Create(group_ids)

    pcoll_groups = create_groups(
        pcoll_group_ids,
        pcoll_corpus,
        pcoll_word,
        pcoll_ignore_corpus,
        pcoll_ignore_word)

    # pylint:disable=expression-not-assigned
    pcoll_groups | WriteToText(known_args.output)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
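

# The module docstring also mentions an iterator form of side input, which the
# pipeline above does not exercise (it uses only AsSingleton and AsList). The
# sketch below is illustrative only: it is not part of the original example,
# and the names 'count_corpus_rows' and 'count_fn' are made up. It shows how
# the corpus rows could instead be passed with apache_beam.pvalue.AsIter,
# which hands the DoFn a lazily materialized iterable rather than a list.
from apache_beam.pvalue import AsIter


def count_corpus_rows(group_ids, corpus):
  """Sketch: count corpus rows per group via an iterator side input."""
  def count_fn(group, corpus_iter):
    # corpus_iter is an iterable of BigQuery row dicts.
    yield (group, sum(1 for _ in corpus_iter))

  return (
      group_ids
      | 'count corpus rows' >> beam.FlatMap(count_fn, AsIter(corpus)))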