github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/top_wikipedia_sessions.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""An example that reads Wikipedia edit data and computes strings of edits.

An example that reads Wikipedia edit data from Cloud Storage and computes the
user with the longest string of edits separated by no more than an hour within
each 30-day period.

To execute this pipeline locally using the DirectRunner, specify an
output prefix on GCS::

  --output gs://YOUR_OUTPUT_PREFIX

To execute this pipeline using the Google Cloud Dataflow service, specify
pipeline configuration in addition to the above::

  --job_name NAME_FOR_YOUR_JOB
  --project YOUR_PROJECT_ID
  --region GCE_REGION
  --staging_location gs://YOUR_STAGING_DIRECTORY
  --temp_location gs://YOUR_TEMPORARY_DIRECTORY
  --runner DataflowRunner

The default input is ``gs://dataflow-samples/wikipedia_edits/*.json`` and can
be overridden with --input.
"""

# pytype: skip-file

import argparse
import json
import logging

import apache_beam as beam
from apache_beam import combiners
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import Sessions
from apache_beam.transforms.window import TimestampedValue

ONE_HOUR_IN_SECONDS = 3600
THIRTY_DAYS_IN_SECONDS = 30 * 24 * ONE_HOUR_IN_SECONDS
# The largest signed 64-bit integer (2**63 - 1), used below as an upper bound
# on abs(hash(...)) when sampling input elements.
MAX_TIMESTAMP = 0x7fffffffffffffff


class ExtractUserAndTimestampDoFn(beam.DoFn):
  """Extracts user and timestamp representing a Wikipedia edit."""
  def process(self, element):
    table_row = json.loads(element)
    if 'contributor_username' in table_row:
      user_name = table_row['contributor_username']
      timestamp = table_row['timestamp']
      # Yielding a TimestampedValue assigns the edit's timestamp to the
      # element, which the session and fixed windows below rely on.
      yield TimestampedValue(user_name, timestamp)


class ComputeSessions(beam.PTransform):
  """Computes the number of edits in each user session.

  A session is defined as a string of edits where each is separated from the
  next by less than an hour.
  """
  def expand(self, pcoll):
    return (
        pcoll
        | 'ComputeSessionsWindow' >> beam.WindowInto(
            Sessions(gap_size=ONE_HOUR_IN_SECONDS))
        | combiners.Count.PerElement())


class TopPerMonth(beam.PTransform):
  """Computes the longest session ending in each month."""
  def expand(self, pcoll):
    return (
        pcoll
        | 'TopPerMonthWindow' >> beam.WindowInto(
            FixedWindows(size=THIRTY_DAYS_IN_SECONDS))
        | 'Top' >> combiners.core.CombineGlobally(
            combiners.TopCombineFn(
                n=10, key=lambda sessions_count: sessions_count[1])
            ).without_defaults())


class SessionsToStringsDoFn(beam.DoFn):
  """Adds the session information to be part of the key."""
  def process(self, element, window=beam.DoFn.WindowParam):
    yield (element[0] + ' : ' + str(window), element[1])


class FormatOutputDoFn(beam.DoFn):
  """Formats a string containing the user, count, and session."""
  def process(self, element, window=beam.DoFn.WindowParam):
    for kv in element:
      session = kv[0]
      count = kv[1]
      yield session + ' : ' + str(count) + ' : ' + str(window)


class ComputeTopSessions(beam.PTransform):
  """Computes the top user sessions for each month."""
  def __init__(self, sampling_threshold):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.sampling_threshold = sampling_threshold

  def expand(self, pcoll):
    return (
        pcoll
        | 'ExtractUserAndTimestamp' >> beam.ParDo(
            ExtractUserAndTimestampDoFn())
        # Deterministically sample the input: keep an element iff its hash
        # lands in the first sampling_threshold fraction of the hash range.
        | beam.Filter(
            lambda x: (
                abs(hash(x)) <= MAX_TIMESTAMP * self.sampling_threshold))
        | ComputeSessions()
        | 'SessionsToStrings' >> beam.ParDo(SessionsToStringsDoFn())
        | TopPerMonth()
        | 'FormatOutput' >> beam.ParDo(FormatOutputDoFn()))


def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table '
      'exported as JSON.')
  parser.add_argument(
      '--output', required=True, help='Output file to write results to.')
  parser.add_argument(
      '--sampling_threshold',
      type=float,
      default=0.1,
      help='Fraction of entries used for session tracking.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:
    (  # pylint: disable=expression-not-assigned
        p
        | ReadFromText(known_args.input)
        | ComputeTopSessions(known_args.sampling_threshold)
        | WriteToText(known_args.output))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()