github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/avro_bitcoin.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """Collect statistics on transactions in a public bitcoin dataset that was
    19  exported to avro
    20  
    21  Usage:
    22  export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
    23  python -m apache_beam.examples.bitcoin \
    24    --compress --fastavro --output fastavro-compressed
    25  """

# pytype: skip-file

import argparse
import logging

from fastavro.schema import parse_schema

import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro
from apache_beam.io.avroio import WriteToAvro
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class BitcoinTxnCountDoFn(beam.DoFn):
  """Count inputs and outputs per transaction"""
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
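    # A Metrics counter tracks a single running total, while a distribution
    # tracks the count, sum, min, and max of the values reported to it.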
    self.txn_counter = Metrics.counter(self.__class__, 'txns')
    self.inputs_dist = Metrics.distribution(self.__class__, 'inputs_per_txn')
    self.outputs_dist = Metrics.distribution(self.__class__, 'outputs_per_txn')
    self.output_amts_dist = Metrics.distribution(self.__class__, 'output_amts')
    self.txn_amts_dist = Metrics.distribution(self.__class__, 'txn_amts')

  def process(self, elem):
    """Update counters and distributions, and filter and sum some fields"""

    inputs = elem['inputs']
    outputs = elem['outputs']

    self.txn_counter.inc()

    num_inputs = len(inputs)
    num_outputs = len(outputs)

    self.inputs_dist.update(num_inputs)
    self.outputs_dist.update(num_outputs)

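    # Total the output amounts (in satoshis), recording each amount in a
    # distribution along the way.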
    total = 0
    for output in outputs:
      amt = output['output_satoshis']
      self.output_amts_dist.update(amt)
      total += amt

    self.txn_amts_dist.update(total)

    return [{
        "transaction_id": elem["transaction_id"],
        "timestamp": elem["timestamp"],
        "block_id": elem["block_id"],
        "previous_block": elem["previous_block"],
        "num_inputs": num_inputs,
        "num_outputs": num_outputs,
        "sum_output": total,
    }]


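# For illustration, a summary record emitted by BitcoinTxnCountDoFn has this
# shape (the values here are made up):
#
#   {
#       'transaction_id': '4a5e1e4b...',
#       'timestamp': 1231006505,
#       'block_id': '000000000019d6...',
#       'previous_block': '0000000000000000...',
#       'num_inputs': 1,
#       'num_outputs': 2,
#       'sum_output': 5000000000,
#   }
#
# The Avro schema below describes exactly these records.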
SCHEMA = parse_schema({
    "namespace": "example.avro",
    "type": "record",
    "name": "Transaction",
    "fields": [{
        "name": "transaction_id", "type": "string"
    }, {
        "name": "timestamp", "type": "long"
    }, {
        "name": "block_id", "type": "string"
    }, {
        "name": "previous_block", "type": "string"
    }, {
        "name": "num_inputs", "type": "int"
    }, {
        "name": "num_outputs", "type": "int"
    }, {
        "name": "sum_output", "type": "long"
    }]
})
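
# As a quick local sanity check (a sketch, independent of the pipeline),
# records of this shape can be round-tripped through fastavro directly:
#
#   import io
#   from fastavro import writer, reader
#   buf = io.BytesIO()
#   writer(buf, SCHEMA, [record])
#   buf.seek(0)
#   assert next(reader(buf)) == record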


def run(argv=None):
  """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions"""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://beam-avro-test/bitcoin/txns/*',
      help='Input file(s) to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--compress',
      dest='compress',
      required=False,
      action='store_true',
      help='When set, compress the output data')
  parser.add_argument(
      '--fastavro',
      dest='use_fastavro',
      required=False,
      action='store_true',
      help='When set, use fastavro for Avro I/O')
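  # Note: recent Beam releases use fastavro for Avro I/O by default, so this
  # flag is kept for compatibility and is not consulted below.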

  opts, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
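  # The PipelineResult is needed after run() to query metrics, so the explicit
  # run()/wait_until_finish() form is used below instead of the usual
  # `with beam.Pipeline(...) as p:` context manager.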

  # Read the Avro file[pattern] into a PCollection.
  records = p | 'read' >> ReadFromAvro(opts.input)

  measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

  # pylint: disable=expression-not-assigned
  measured | 'write' >> WriteToAvro(
      opts.output,
      schema=SCHEMA,
      codec=('deflate' if opts.compress else 'null'),
  )

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when only creating a template, which doesn't run
  # the pipeline.
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    metrics = result.metrics().query()
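    # query() with no arguments returns every metric; a MetricsFilter could
    # narrow the result to a single metric by name (sketch):
    #   from apache_beam.metrics.metric import MetricsFilter
    #   result.metrics().query(MetricsFilter().with_name('txns'))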

    for counter in metrics['counters']:
      logging.info("Counter: %s", counter)

    for dist in metrics['distributions']:
      logging.info("Distribution: %s", dist)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()