github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/datastore_wordcount.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow that uses Google Cloud Datastore.

This example shows how to use ``datastoreio`` to read from and write to
Google Cloud Datastore. Note that running this example may incur charges for
Cloud Datastore operations.

See https://developers.google.com/datastore/ for more details on Google Cloud
Datastore.
See https://beam.apache.org/get-started/quickstart on how to run a Beam
pipeline.

Read-only Mode: In this mode, this example reads Cloud Datastore entities using
the ``datastoreio.ReadFromDatastore`` transform, extracts the words,
counts them and writes the output to a set of files.

The following options must be provided to run this pipeline in read-only mode:
``
--project GCP_PROJECT
--kind YOUR_DATASTORE_KIND
--output [YOUR_LOCAL_FILE *or* gs://YOUR_OUTPUT_PATH]
--read_only
``

Read-write Mode: In this mode, this example reads words from an input file,
converts them to Beam ``Entity`` objects and writes them to Cloud Datastore
using the ``datastoreio.WriteToDatastore`` transform. A second pipeline then
reads these Cloud Datastore entities using the
``datastoreio.ReadFromDatastore`` transform, extracts the words, counts them
and writes the output to a set of files.

The following options must be provided to run this pipeline in read-write mode:
``
--project GCP_PROJECT
--kind YOUR_DATASTORE_KIND
--output [YOUR_LOCAL_FILE *or* gs://YOUR_OUTPUT_PATH]
``
"""
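
# For illustration only: one way to invoke this example in read-write mode.
# The project ID, kind, and output prefix below are placeholders, not values
# required by the pipeline:
#
#   python -m apache_beam.examples.cookbook.datastore_wordcount \
#       --project YOUR_GCP_PROJECT \
#       --kind YOUR_DATASTORE_KIND \
#       --output /tmp/datastore_wordcount
#
# Passing --read_only in addition skips the write pipeline and only reads
# entities that already exist under the given kind.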

# pytype: skip-file

import argparse
import logging
import re
import sys
from typing import Iterable
from typing import Optional
from typing import Text
import uuid

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.types import Key
from apache_beam.io.gcp.datastore.v1new.types import Query
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions


@beam.typehints.with_input_types(Entity)
@beam.typehints.with_output_types(Text)
class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def __init__(self):
    self.empty_line_counter = Metrics.counter('main', 'empty_lines')
    self.word_length_counter = Metrics.counter('main', 'word_lengths')
    self.word_counter = Metrics.counter('main', 'total_words')
    self.word_lengths_dist = Metrics.distribution('main', 'word_len_dist')

  def process(self, element):
    # type: (Entity) -> Optional[Iterable[Text]]

    """Extract words from the 'content' property of Cloud Datastore entities.

    Each element is an entity whose 'content' property holds a line of text.
    If the line is blank, the empty-line counter is incremented instead.

    Args:
      element: the input entity to be processed
    Returns:
      A list of words found, or None for a blank line.
    """
    text_line = element.properties.get('content', '')
    if not text_line:
      self.empty_line_counter.inc()
      return None

    words = re.findall(r'[A-Za-z\']+', text_line)
    for w in words:
      self.word_length_counter.inc(len(w))
      self.word_lengths_dist.update(len(w))
      self.word_counter.inc()
    return words


class EntityWrapper(object):
  """Create a Cloud Datastore entity from the given string."""
  def __init__(self, project, namespace, kind, ancestor):
    self._project = project
    self._namespace = namespace
    self._kind = kind
    self._ancestor = ancestor

  def make_entity(self, content):
    ancestor_key = Key([self._kind, self._ancestor],
                       namespace=self._namespace,
                       project=self._project)
    # Namespace and project are inherited from the parent key.
    key = Key([self._kind, str(uuid.uuid4())], parent=ancestor_key)
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity


def write_to_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that writes entities to Cloud Datastore."""
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'read' >> ReadFromText(user_options.input)
        | 'create entity' >> beam.Map(
            EntityWrapper(
                project,
                user_options.namespace,
                user_options.kind,
                user_options.ancestor).make_entity)
        | 'write to datastore' >> WriteToDatastore(project))


def make_ancestor_query(project, kind, namespace, ancestor):
  """Creates a Cloud Datastore ancestor query.

  The returned query will fetch all the entities that have the parent key name
  set to the given `ancestor`.
  """
  ancestor_key = Key([kind, ancestor], project=project, namespace=namespace)
  return Query(kind, project, namespace, ancestor_key)
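

# A sketch of how the two pieces above fit together (illustrative values only,
# not used anywhere in this example): an EntityWrapper built with kind 'demo'
# and ancestor 'root' writes each line of text under a key path of the form
#
#   ('demo', 'root') -> ('demo', '<uuid4>')
#
# and make_ancestor_query(project, 'demo', namespace, 'root') builds the
# ancestor query that fetches exactly those child entities back, roughly the
# GQL equivalent of:
#
#   SELECT * FROM demo WHERE __key__ HAS ANCESTOR KEY(demo, 'root')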


def read_from_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that reads entities from Cloud Datastore."""
  p = beam.Pipeline(options=pipeline_options)
  # Create a query to read entities from Cloud Datastore.
  query = make_ancestor_query(
      project, user_options.kind, user_options.namespace, user_options.ancestor)

  # Read entities from Cloud Datastore into a PCollection.
  lines = p | 'read from datastore' >> ReadFromDatastore(query)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return word, sum(ones)

  counts = (
      lines
      | 'split' >> beam.ParDo(WordExtractingDoFn())
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %s' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> beam.io.WriteToText(
      file_path_prefix=user_options.output, num_shards=user_options.num_shards)

  result = p.run()
  # Wait until completion; the main thread will read job metrics from the
  # result after the pipeline finishes.
  result.wait_until_finish()
  return result
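

# The counting chain above does not depend on Datastore itself. As a minimal
# sketch (assuming only the core apache_beam package and the default
# DirectRunner), the same split/pair/group/count steps can be exercised on
# in-memory strings:
#
#   with beam.Pipeline() as p:
#     _ = (
#         p
#         | beam.Create(['to be or not to be', ''])
#         | beam.FlatMap(lambda line: re.findall(r"[A-Za-z']+", line))
#         | beam.Map(lambda word: (word, 1))
#         | beam.GroupByKey()
#         | beam.Map(lambda kv: (kv[0], sum(kv[1])))
#         | beam.Map(print))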


def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--kind', dest='kind', required=True, help='Datastore Kind')
  parser.add_argument(
      '--namespace', dest='namespace', help='Datastore Namespace')
  parser.add_argument(
      '--ancestor',
      dest='ancestor',
      default='root',
      help='The ancestor key name for all entities.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--read_only',
      action='store_true',
      help='Read an existing dataset, do not write first')
  parser.add_argument(
      '--num_shards',
      dest='num_shards',
      type=int,
      # 0 lets the system choose the number of shards automatically.
      default=0,
      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  project = pipeline_options.view_as(GoogleCloudOptions).project
  if project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # Write to Datastore if the `read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(project, known_args, pipeline_options)

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
  else:
    logging.warning('unable to retrieve counter metrics from runner')

  word_lengths_filter = MetricsFilter().with_name('word_len_dist')
  query_result = result.metrics().query(word_lengths_filter)
  if query_result['distributions']:
    word_lengths_dist = query_result['distributions'][0]
    logging.info('average word length: %d', word_lengths_dist.committed.mean)
  else:
    logging.warning('unable to retrieve distribution metrics from runner')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()