github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/top_wikipedia_sessions.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """An example that reads Wikipedia edit data and computes strings of edits.
    19  
    20  An example that reads Wikipedia edit data from Cloud Storage and computes the
    21  user with the longest string of edits separated by no more than an hour within
    22  each 30 day period.
    23  
    24  To execute this pipeline locally using the DirectRunner, specify an
    25  output prefix on GCS:::
    26  
    27    --output gs://YOUR_OUTPUT_PREFIX
    28  
    29  To execute this pipeline using the Google Cloud Dataflow service, specify
    30  pipeline configuration in addition to the above:::
    31  
    32    --job_name NAME_FOR_YOUR_JOB
    33    --project YOUR_PROJECT_ID
    34    --region GCE_REGION
    35    --staging_location gs://YOUR_STAGING_DIRECTORY
    36    --temp_location gs://YOUR_TEMPORARY_DIRECTORY
    37    --runner DataflowRunner
    38  
    39  The default input is ``gs://dataflow-samples/wikipedia_edits/*.json`` and can
    40  be overridden with --input.
    41  """
    42  
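# Example invocation with the DirectRunner (illustrative paths: the output
# bucket below is a placeholder, and any readable input glob and writable
# output prefix work):
#
#   python -m apache_beam.examples.complete.top_wikipedia_sessions \
#       --input gs://dataflow-samples/wikipedia_edits/*.json \
#       --output gs://YOUR_BUCKET/top_wikipedia_sessions/output
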
# pytype: skip-file

import argparse
import json
import logging

import apache_beam as beam
from apache_beam import combiners
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import Sessions
from apache_beam.transforms.window import TimestampedValue

ONE_HOUR_IN_SECONDS = 3600
THIRTY_DAYS_IN_SECONDS = 30 * 24 * ONE_HOUR_IN_SECONDS
# Largest 64-bit signed integer; it bounds abs(hash(...)) and serves as the
# denominator of the sampling filter in ComputeTopSessions below.
MAX_TIMESTAMP = 0x7fffffffffffffff
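# Sampling arithmetic: with the default --sampling_threshold of 0.1, an
# element passes the filter only when abs(hash(x)) <= 0.1 * (2**63 - 1),
# i.e. when its hash lands in the lowest tenth of the possible range, so
# roughly 10% of elements are kept.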


class ExtractUserAndTimestampDoFn(beam.DoFn):
  """Extracts user and timestamp representing a Wikipedia edit."""
  def process(self, element):
    table_row = json.loads(element)
    if 'contributor_username' in table_row:
      user_name = table_row['contributor_username']
      timestamp = table_row['timestamp']
      yield TimestampedValue(user_name, timestamp)
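
# For illustration only (the field names follow the sample dataset; the exact
# schema is an assumption beyond the single key this DoFn checks): given a row
#
#   {"contributor_username": "Alice", "timestamp": 1325376000.0, ...}
#
# the DoFn above yields TimestampedValue('Alice', 1325376000.0), so the
# username becomes the element and the edit time becomes its event timestamp,
# which the windowing transforms below rely on.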


class ComputeSessions(beam.PTransform):
  """Computes the number of edits in each user session.

  A session is defined as a string of edits where each is separated from the
  next by less than an hour.
  """
  def expand(self, pcoll):
    return (
        pcoll
        | 'ComputeSessionsWindow' >> beam.WindowInto(
            Sessions(gap_size=ONE_HOUR_IN_SECONDS))
        | combiners.Count.PerElement())
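
# A sketch of how the session windowing behaves (times in seconds are
# illustrative): one user's edits at t=0, t=1800, and t=5000 merge into a
# single session window [0, 8600) because each edit falls within an hour of
# the previous one, while an edit at t=20000 starts a new session. After
# Count.PerElement(), each (user, session window) pair carries the number of
# edits in that session.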


class TopPerMonth(beam.PTransform):
  """Computes the 10 longest sessions ending in each 30-day window."""
  def expand(self, pcoll):
    return (
        pcoll
        | 'TopPerMonthWindow' >> beam.WindowInto(
            FixedWindows(size=THIRTY_DAYS_IN_SECONDS))
        | 'Top' >> beam.CombineGlobally(
            combiners.TopCombineFn(
                n=10, key=lambda sessions_count: sessions_count[1]))
        .without_defaults())
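
# Per 30-day window, the combine above emits a single list of at most ten
# (key, count) pairs in descending count order, where each key is the
# 'user : session window' string built by SessionsToStringsDoFn (defined
# below, though applied earlier in the pipeline); without_defaults() means
# windows with no sessions produce no output at all.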


class SessionsToStringsDoFn(beam.DoFn):
  """Adds the session information to be part of the key."""
  def process(self, element, window=beam.DoFn.WindowParam):
    yield (element[0] + ' : ' + str(window), element[1])
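
# With illustrative values: ('Alice', 42) in session window [0.0, 8600.0)
# becomes ('Alice : [0.0, 8600.0)', 42). Folding the session window into the
# key preserves it across the re-windowing into fixed 30-day windows in
# TopPerMonth.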


class FormatOutputDoFn(beam.DoFn):
  """Formats each top entry as a 'user : session : count : window' string."""
  def process(self, element, window=beam.DoFn.WindowParam):
    for kv in element:
      session = kv[0]
      count = kv[1]
      yield session + ' : ' + str(count) + ' : ' + str(window)
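
# Because `element` here is the list produced by TopCombineFn, each emitted
# line looks like (values illustrative)
#
#   user1 : [0.0, 8600.0) : 42 : [0.0, 2592000.0)
#
# i.e. user, session window, edit count, and the 30-day window in which the
# session was ranked.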


class ComputeTopSessions(beam.PTransform):
  """Computes the top user sessions in each 30-day window."""
  def __init__(self, sampling_threshold):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.sampling_threshold = sampling_threshold

  def expand(self, pcoll):
    return (
        pcoll
        | 'ExtractUserAndTimestamp' >> beam.ParDo(
            ExtractUserAndTimestampDoFn())
        # Subsample users by hashing the element (a username): a user's edits
        # are either all kept or all dropped, so surviving sessions stay
        # complete. This relies on hash() giving consistent values across
        # workers.
        | beam.Filter(
            lambda x: abs(hash(x)) <= MAX_TIMESTAMP * self.sampling_threshold)
        | ComputeSessions()
        | 'SessionsToStrings' >> beam.ParDo(SessionsToStringsDoFn())
        | TopPerMonth()
        | 'FormatOutput' >> beam.ParDo(FormatOutputDoFn()))
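
# A minimal local smoke test of ComputeTopSessions, as a sketch (the inline
# JSON rows are made up; TestPipeline comes from Beam's testing utilities,
# not from anything defined in this file):
#
#   from apache_beam.testing.test_pipeline import TestPipeline
#
#   EDITS = [
#       '{"timestamp": 0.0, "contributor_username": "user1"}',
#       '{"timestamp": 1800.0, "contributor_username": "user1"}',
#       '{"timestamp": 0.0, "contributor_username": "user2"}',
#   ]
#   with TestPipeline() as p:
#       top = p | beam.Create(EDITS) | ComputeTopSessions(1.0)
#       # With sampling_threshold=1.0 nothing is filtered out; `top` then
#       # holds lines such as 'user1 : [0.0, 5400.0) : 2 : [0.0, 2592000.0)'.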


def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as JSON.')
  parser.add_argument(
      '--output', required=True, help='Output file to write results to.')
  parser.add_argument(
      '--sampling_threshold',
      type=float,
      default=0.1,
      help='Fraction of entries used for session tracking.')
  known_args, pipeline_args = parser.parse_known_args(argv)
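  # parse_known_args splits the command line: the flags declared above land in
  # known_args, while anything else (e.g. --runner, --project) passes through
  # in pipeline_args to PipelineOptions below.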
  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:
    (  # pylint: disable=expression-not-assigned
        p
        | ReadFromText(known_args.input)
        | ComputeTopSessions(known_args.sampling_threshold)
        | WriteToText(known_args.output))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()