github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/avro_bitcoin.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Collect statistics on transactions in a public bitcoin dataset that was
exported to Avro.

Usage:
  export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
  python -m apache_beam.examples.avro_bitcoin \
      --compress --fastavro --output fastavro-compressed
"""

# pytype: skip-file

import argparse
import logging

from fastavro.schema import parse_schema

import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro
from apache_beam.io.avroio import WriteToAvro
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class BitcoinTxnCountDoFn(beam.DoFn):
  """Count inputs and outputs per transaction"""
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.txn_counter = Metrics.counter(self.__class__, 'txns')
    self.inputs_dist = Metrics.distribution(self.__class__, 'inputs_per_txn')
    self.outputs_dist = Metrics.distribution(self.__class__, 'outputs_per_txn')
    self.output_amts_dist = Metrics.distribution(self.__class__, 'output_amts')
    self.txn_amts_dist = Metrics.distribution(self.__class__, 'txn_amts')

  def process(self, elem):
    """Update counters and distributions, and filter and sum some fields"""

    inputs = elem['inputs']
    outputs = elem['outputs']

    self.txn_counter.inc()

    num_inputs = len(inputs)
    num_outputs = len(outputs)

    self.inputs_dist.update(num_inputs)
    self.outputs_dist.update(num_outputs)

    total = 0
    for output in outputs:
      amt = output['output_satoshis']
      self.output_amts_dist.update(amt)
      total += amt

    self.txn_amts_dist.update(total)

    return [{
        "transaction_id": elem["transaction_id"],
        "timestamp": elem["timestamp"],
        "block_id": elem["block_id"],
        "previous_block": elem["previous_block"],
        "num_inputs": num_inputs,
        "num_outputs": num_outputs,
        "sum_output": total,
    }]


SCHEMA = parse_schema({
    "namespace": "example.avro",
    "type": "record",
    "name": "Transaction",
    "fields": [{
        "name": "transaction_id", "type": "string"
    }, {
        "name": "timestamp", "type": "long"
    }, {
        "name": "block_id", "type": "string"
    }, {
        "name": "previous_block", "type": "string"
    }, {
        "name": "num_inputs", "type": "int"
    }, {
        "name": "num_outputs", "type": "int"
    }, {
        "name": "sum_output", "type": "long"
    }]
})


def run(argv=None):
  """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions"""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://beam-avro-test/bitcoin/txns/*',
      help='Input file(s) to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--compress',
      dest='compress',
      required=False,
      action='store_true',
      help='When set, compress the output data')
  parser.add_argument(
      '--fastavro',
      dest='use_fastavro',
      required=False,
      action='store_true',
      help='When set, use fastavro for Avro I/O')

  opts, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the avro file[pattern] into a PCollection.
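  # ReadFromAvro yields each record as a Python dict keyed by field name, so
  # the DoFn above can index elem['inputs'] and elem['outputs'] directly.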
  records = \
      p | 'read' >> ReadFromAvro(opts.input)

  measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

  # pylint: disable=expression-not-assigned
  measured | 'write' >> \
      WriteToAvro(
          opts.output,
          schema=SCHEMA,
          codec=('deflate' if opts.compress else 'null'),
      )

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    metrics = result.metrics().query()

    for counter in metrics['counters']:
      logging.info("Counter: %s", counter)

    for dist in metrics['distributions']:
      logging.info("Distribution: %s", dist)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
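
# ---------------------------------------------------------------------------
# Editor's addition, not part of the upstream example: a minimal sketch of a
# local smoke test, assuming the DirectRunner and a synthetic input file. The
# INPUT_SCHEMA below is inferred from the fields BitcoinTxnCountDoFn actually
# reads; the real gs://beam-avro-test/bitcoin export carries more fields, and
# the 'input_script_bytes' field name is hypothetical (the DoFn only counts
# the elements of 'inputs'). run_local_smoke_test() is never invoked by this
# module; call it manually to try the pipeline without GCS credentials.

INPUT_SCHEMA = parse_schema({
    "namespace": "example.avro",
    "type": "record",
    "name": "InputTransaction",
    "fields": [
        {"name": "transaction_id", "type": "string"},
        {"name": "timestamp", "type": "long"},
        {"name": "block_id", "type": "string"},
        {"name": "previous_block", "type": "string"},
        {
            "name": "inputs",
            "type": {
                "type": "array",
                "items": {
                    "type": "record",
                    "name": "TxnInput",
                    "fields": [
                        # Hypothetical field; only the array length is used.
                        {"name": "input_script_bytes", "type": "bytes"}
                    ]
                }
            }
        },
        {
            "name": "outputs",
            "type": {
                "type": "array",
                "items": {
                    "type": "record",
                    "name": "TxnOutput",
                    "fields": [{"name": "output_satoshis", "type": "long"}]
                }
            }
        },
    ]
})


def run_local_smoke_test(tmpdir='/tmp/avro_bitcoin_smoke'):
  """Write one synthetic transaction to Avro, then run the pipeline on it."""
  import os

  import fastavro

  os.makedirs(tmpdir, exist_ok=True)
  input_path = os.path.join(tmpdir, 'txns.avro')
  record = {
      'transaction_id': 'txn-0',
      'timestamp': 1231006505,
      'block_id': 'block-0',
      'previous_block': 'block-genesis',
      'inputs': [{'input_script_bytes': b''}],
      'outputs': [{'output_satoshis': 5000000000}],
  }
  with open(input_path, 'wb') as f:
    fastavro.writer(f, INPUT_SCHEMA, [record])

  # parse_known_args() accepts a list, so run() can be driven directly; with
  # no extra pipeline args, Beam falls back to the DirectRunner.
  run(['--input', input_path, '--output', os.path.join(tmpdir, 'out')])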