github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/datastore_wordcount.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow that uses Google Cloud Datastore.

This example shows how to use ``datastoreio`` to read from and write to
Google Cloud Datastore. Note that running this example may incur charges for
Cloud Datastore operations.

See https://developers.google.com/datastore/ for more details on Google Cloud
Datastore.
See https://beam.apache.org/get-started/quickstart for instructions on
how to run a Beam pipeline.

Read-only Mode: In this mode, this example reads Cloud Datastore entities using
the ``datastoreio.ReadFromDatastore`` transform, extracts the words,
counts them and writes the output to a set of files.

The following options must be provided to run this pipeline in read-only mode:
``
--project GCP_PROJECT
--kind YOUR_DATASTORE_KIND
--output [YOUR_LOCAL_FILE *or* gs://YOUR_OUTPUT_PATH]
--read_only
``

Read-write Mode: In this mode, this example reads words from an input file,
converts them to Beam ``Entity`` objects and writes them to Cloud Datastore
using the ``datastoreio.WriteToDatastore`` transform. A second pipeline
then reads these Cloud Datastore entities back using the
``datastoreio.ReadFromDatastore`` transform, extracts the words, counts them
and writes the output to a set of files.

The following options must be provided to run this pipeline in read-write mode:
``
--project GCP_PROJECT
--kind YOUR_DATASTORE_KIND
--output [YOUR_LOCAL_FILE *or* gs://YOUR_OUTPUT_PATH]
``
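
For example, a hypothetical read-write invocation (the project, kind and
output values below are placeholders) might look like::

  python datastore_wordcount.py \
      --project YOUR_GCP_PROJECT \
      --kind YOUR_DATASTORE_KIND \
      --output /tmp/word_counts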
"""

# pytype: skip-file

import argparse
import logging
import re
import sys
import uuid
from typing import Iterable
from typing import Optional
from typing import Text

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.types import Key
from apache_beam.io.gcp.datastore.v1new.types import Query
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


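# The decorators below register input/output type hints with Beam's type
# checking system: this DoFn consumes Datastore Entity objects and emits
# word strings.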
@beam.typehints.with_input_types(Entity)
@beam.typehints.with_output_types(Text)
class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def __init__(self):
    self.empty_line_counter = Metrics.counter('main', 'empty_lines')
    self.word_length_counter = Metrics.counter('main', 'word_lengths')
    self.word_counter = Metrics.counter('main', 'total_words')
    self.word_lengths_dist = Metrics.distribution('main', 'word_len_dist')

  def process(self, element):
    # type: (Entity) -> Optional[Iterable[Text]]

    """Extract words from the 'content' property of Cloud Datastore entities.

    The element is an entity whose 'content' property holds a line of text.
    Blank lines are counted but produce no output.

    Args:
      element: the input entity to be processed

    Returns:
      A list of the words found, or ``None`` if the line is empty.
    """
    text_line = element.properties.get('content', '')
    if not text_line:
      self.empty_line_counter.inc()
      return None

    words = re.findall(r'[A-Za-z\']+', text_line)
    for w in words:
      self.word_length_counter.inc(len(w))
      self.word_lengths_dist.update(len(w))
      self.word_counter.inc()
    return words


class EntityWrapper(object):
  """Create a Cloud Datastore entity from the given string."""
  def __init__(self, project, namespace, kind, ancestor):
    self._project = project
    self._namespace = namespace
    self._kind = kind
    self._ancestor = ancestor

  def make_entity(self, content):
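    # All entities are created as children of a single ancestor key so that
    # the read pipeline can fetch exactly these entities with a strongly
    # consistent ancestor query.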
    ancestor_key = Key([self._kind, self._ancestor],
                       namespace=self._namespace,
                       project=self._project)
    # Namespace and project are inherited from parent key.
    key = Key([self._kind, str(uuid.uuid4())], parent=ancestor_key)
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity


def write_to_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that writes entities to Cloud Datastore."""
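  # Using the pipeline as a context manager runs it and waits for completion
  # when the `with` block exits.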
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'read' >> ReadFromText(user_options.input)
        | 'create entity' >> beam.Map(
            EntityWrapper(
                project,
                user_options.namespace,
                user_options.kind,
                user_options.ancestor).make_entity)
        | 'write to datastore' >> WriteToDatastore(project))


def make_ancestor_query(project, kind, namespace, ancestor):
  """Creates a Cloud Datastore ancestor query.

  The returned query fetches all the entities that are descendants of the
  ancestor key with the given `ancestor` name.
  """
  ancestor_key = Key([kind, ancestor], project=project, namespace=namespace)
  return Query(kind, project, namespace, ancestor_key)


def read_from_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that reads entities from Cloud Datastore."""
  p = beam.Pipeline(options=pipeline_options)
  # Create a query to read entities from Cloud Datastore.
  query = make_ancestor_query(
      project, user_options.kind, user_options.namespace, user_options.ancestor)

  # Read entities from Cloud Datastore into a PCollection.
  lines = p | 'read from datastore' >> ReadFromDatastore(query)
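  # Each element of `lines` is an Entity whose 'content' property holds one
  # line of the original input text.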

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return word, sum(ones)

  counts = (
      lines
      | 'split' >> beam.ParDo(WordExtractingDoFn())
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %s' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> beam.io.WriteToText(
      file_path_prefix=user_options.output, num_shards=user_options.num_shards)

  result = p.run()
  # Wait until completion so that the caller can query post-completion job
  # metrics from the returned result.
  result.wait_until_finish()
  return result


def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--kind', dest='kind', required=True, help='Datastore Kind')
  parser.add_argument(
      '--namespace', dest='namespace', help='Datastore Namespace')
  parser.add_argument(
      '--ancestor',
      dest='ancestor',
      default='root',
      help='The ancestor key name for all entities.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--read_only',
      action='store_true',
      help='Read an existing dataset; do not write first.')
  parser.add_argument(
      '--num_shards',
      dest='num_shards',
      type=int,
      # 0 lets the runner choose the number of shards automatically.
      default=0,
      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)
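  # Any flags not declared above (e.g. --project or --runner) are left in
  # pipeline_args and parsed by PipelineOptions below.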
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  project = pipeline_options.view_as(GoogleCloudOptions).project
  if project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # Write to Datastore if the `--read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(project, known_args, pipeline_options)

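  # Query pipeline metrics from the runner.  The `committed` attribute holds
  # values from successfully committed bundles; some runners only report
  # `attempted` values.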
  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
  else:
    logging.warning('unable to retrieve counter metrics from runner')

  word_lengths_filter = MetricsFilter().with_name('word_len_dist')
  query_result = result.metrics().query(word_lengths_filter)
  if query_result['distributions']:
    word_lengths_dist = query_result['distributions'][0]
    logging.info('average word length: %d', word_lengths_dist.committed.mean)
  else:
    logging.warning('unable to retrieve distribution metrics from runner')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()