github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/snippets/snippets.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Code snippets used in webdocs.

The examples here are written specifically to read well with the accompanying
web docs. Do not rewrite them until you make sure the webdocs still read well
and the rewritten code supports the concept being described. For example, there
are snippets that could be shorter but they are written like this to make a
specific point in the docs.

The code snippets are all organized as self contained functions. Parts of the
function body delimited by [START tag] and [END tag] will be included
automatically in the web docs. The naming convention for the tags is to have as
prefix the PATH_TO_HTML where they are included followed by a descriptive
string. The tags can contain only letters, digits and _.
"""
# pytype: skip-file

import argparse
import base64
import json
from decimal import Decimal

import mock

import apache_beam as beam
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OffsetRangeTracker
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.core import PTransform

# Protect against environments where the Google Cloud Natural Language client
# is not available.
try:
  from apache_beam.ml.gcp import naturallanguageml as nlp
except ImportError:
  nlp = None

# Quiet some pylint warnings that happen because of the somewhat special
# format for the code snippets.
# pylint:disable=invalid-name
# pylint:disable=expression-not-assigned
# pylint:disable=redefined-outer-name
# pylint:disable=reimported
# pylint:disable=unused-variable
# pylint:disable=wrong-import-order, wrong-import-position


class SnippetUtils(object):
  from apache_beam.pipeline import PipelineVisitor

  class RenameFiles(PipelineVisitor):
    """RenameFiles will rewire read/write paths for unit testing.

    RenameFiles will replace the GCS files specified in the read and
    write transforms with local files so the pipeline can be run as a
    unit test. This assumes that read and write transforms defined in snippets
    have already been replaced by the transforms 'DummyReadForTesting' and
    'DummyWriteForTesting' (see snippets_test.py).

    This is as close as we can get to having code snippets that are
    executed and are also ready to be presented in webdocs.
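
    For example (the local paths are illustrative):

      renames = {'read': '/tmp/input.txt', 'write': '/tmp/output.txt'}
      pipeline.visit(SnippetUtils.RenameFiles(renames))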
82 """ 83 def __init__(self, renames): 84 self.renames = renames 85 86 def visit_transform(self, transform_node): 87 if transform_node.full_label.find('DummyReadForTesting') >= 0: 88 transform_node.transform.fn.file_to_read = self.renames['read'] 89 elif transform_node.full_label.find('DummyWriteForTesting') >= 0: 90 transform_node.transform.fn.file_to_write = self.renames['write'] 91 92 93 @mock.patch('apache_beam.Pipeline', TestPipeline) 94 def construct_pipeline(renames): 95 """A reverse words snippet as an example for constructing a pipeline.""" 96 import re 97 98 # This is duplicate of the import statement in 99 # pipelines_constructing_creating tag below, but required to avoid 100 # Unresolved reference in ReverseWords class 101 import apache_beam as beam 102 103 @beam.ptransform_fn 104 @beam.typehints.with_input_types(str) 105 @beam.typehints.with_output_types(str) 106 def ReverseWords(pcoll): 107 """A PTransform that reverses individual elements in a PCollection.""" 108 return pcoll | beam.Map(lambda word: word[::-1]) 109 110 def filter_words(unused_x): 111 """Pass through filter to select everything.""" 112 return True 113 114 # [START pipelines_constructing_creating] 115 import apache_beam as beam 116 117 with beam.Pipeline() as pipeline: 118 pass # build your pipeline here 119 # [END pipelines_constructing_creating] 120 121 # [START pipelines_constructing_reading] 122 lines = pipeline | 'ReadMyFile' >> beam.io.ReadFromText( 123 'gs://some/inputData.txt') 124 # [END pipelines_constructing_reading] 125 126 # [START pipelines_constructing_applying] 127 words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 128 reversed_words = words | ReverseWords() 129 # [END pipelines_constructing_applying] 130 131 # [START pipelines_constructing_writing] 132 filtered_words = reversed_words | 'FilterWords' >> beam.Filter(filter_words) 133 filtered_words | 'WriteMyFile' >> beam.io.WriteToText( 134 'gs://some/outputData.txt') 135 # [END pipelines_constructing_writing] 136 137 pipeline.visit(SnippetUtils.RenameFiles(renames)) 138 139 140 def model_pipelines(): 141 """A wordcount snippet as a simple pipeline example.""" 142 # [START model_pipelines] 143 import argparse 144 import re 145 146 import apache_beam as beam 147 from apache_beam.options.pipeline_options import PipelineOptions 148 149 parser = argparse.ArgumentParser() 150 parser.add_argument( 151 '--input-file', 152 default='gs://dataflow-samples/shakespeare/kinglear.txt', 153 help='The file path for the input text to process.') 154 parser.add_argument( 155 '--output-path', required=True, help='The path prefix for output files.') 156 args, beam_args = parser.parse_known_args() 157 158 beam_options = PipelineOptions(beam_args) 159 with beam.Pipeline(options=beam_options) as pipeline: 160 ( 161 pipeline 162 | beam.io.ReadFromText(args.input_file) 163 | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 164 | beam.Map(lambda x: (x, 1)) 165 | beam.combiners.Count.PerKey() 166 | beam.io.WriteToText(args.output_path)) 167 # [END model_pipelines] 168 169 170 def model_pcollection(output_path): 171 """Creating a PCollection from data in local memory.""" 172 # [START model_pcollection] 173 import apache_beam as beam 174 175 with beam.Pipeline() as pipeline: 176 lines = ( 177 pipeline 178 | beam.Create([ 179 'To be, or not to be: that is the question: ', 180 "Whether 'tis nobler in the mind to suffer ", 181 'The slings and arrows of outrageous fortune, ', 182 'Or to take arms against a sea of troubles, ', 183 ])) 184 # [END 
model_pcollection] 185 186 lines | beam.io.WriteToText(output_path) 187 188 189 def pipeline_options_remote(): 190 """Creating a Pipeline using a PipelineOptions object for remote execution.""" 191 192 # [START pipeline_options_create] 193 from apache_beam.options.pipeline_options import PipelineOptions 194 195 beam_options = PipelineOptions() 196 # [END pipeline_options_create] 197 198 # [START pipeline_options_define_custom] 199 from apache_beam.options.pipeline_options import PipelineOptions 200 201 class MyOptions(PipelineOptions): 202 @classmethod 203 def _add_argparse_args(cls, parser): 204 parser.add_argument('--input') 205 parser.add_argument('--output') 206 207 # [END pipeline_options_define_custom] 208 209 @mock.patch('apache_beam.Pipeline') 210 def dataflow_options(mock_pipeline): 211 # [START pipeline_options_dataflow_service] 212 import argparse 213 214 import apache_beam as beam 215 from apache_beam.options.pipeline_options import PipelineOptions 216 217 parser = argparse.ArgumentParser() 218 # parser.add_argument('--my-arg', help='description') 219 args, beam_args = parser.parse_known_args() 220 221 # Create and set your PipelineOptions. 222 # For Cloud execution, specify DataflowRunner and set the Cloud Platform 223 # project, job name, temporary files location, and region. 224 # For more information about regions, check: 225 # https://cloud.google.com/dataflow/docs/concepts/regional-endpoints 226 beam_options = PipelineOptions( 227 beam_args, 228 runner='DataflowRunner', 229 project='my-project-id', 230 job_name='unique-job-name', 231 temp_location='gs://my-bucket/temp', 232 region='us-central1') 233 # Note: Repeatable options like dataflow_service_options or experiments must 234 # be specified as a list of string(s). 235 # e.g. dataflow_service_options=['enable_prime'] 236 237 # Create the Pipeline with the specified options. 238 with beam.Pipeline(options=beam_options) as pipeline: 239 pass # build your pipeline here. 240 # [END pipeline_options_dataflow_service] 241 return beam_options 242 243 beam_options = dataflow_options() 244 args = beam_options.view_as(MyOptions) 245 246 with TestPipeline() as pipeline: # Use TestPipeline for testing. 247 lines = pipeline | beam.io.ReadFromText(args.input) 248 lines | beam.io.WriteToText(args.output) 249 250 251 @mock.patch('apache_beam.Pipeline', TestPipeline) 252 def pipeline_options_local(): 253 """Creating a Pipeline using a PipelineOptions object for local execution.""" 254 255 # [START pipeline_options_define_custom_with_help_and_default] 256 from apache_beam.options.pipeline_options import PipelineOptions 257 258 class MyOptions(PipelineOptions): 259 @classmethod 260 def _add_argparse_args(cls, parser): 261 parser.add_argument( 262 '--input', 263 default='gs://dataflow-samples/shakespeare/kinglear.txt', 264 help='The file path for the input text to process.') 265 parser.add_argument( 266 '--output', required=True, help='The path prefix for output files.') 267 268 # [END pipeline_options_define_custom_with_help_and_default] 269 270 # [START pipeline_options_local] 271 import argparse 272 273 import apache_beam as beam 274 from apache_beam.options.pipeline_options import PipelineOptions 275 276 parser = argparse.ArgumentParser() 277 # parser.add_argument('--my-arg') 278 args, beam_args = parser.parse_known_args() 279 280 # Create and set your Pipeline Options. 
281 beam_options = PipelineOptions(beam_args) 282 args = beam_options.view_as(MyOptions) 283 284 with beam.Pipeline(options=beam_options) as pipeline: 285 lines = ( 286 pipeline 287 | beam.io.ReadFromText(args.input) 288 | beam.io.WriteToText(args.output)) 289 # [END pipeline_options_local] 290 291 292 @mock.patch('apache_beam.Pipeline', TestPipeline) 293 def pipeline_options_command_line(): 294 """Creating a Pipeline by passing a list of arguments.""" 295 296 # [START pipeline_options_command_line] 297 # Use Python argparse module to parse custom arguments 298 import argparse 299 300 import apache_beam as beam 301 from apache_beam.options.pipeline_options import PipelineOptions 302 303 # For more details on how to use argparse, take a look at: 304 # https://docs.python.org/3/library/argparse.html 305 parser = argparse.ArgumentParser() 306 parser.add_argument( 307 '--input-file', 308 default='gs://dataflow-samples/shakespeare/kinglear.txt', 309 help='The file path for the input text to process.') 310 parser.add_argument( 311 '--output-path', required=True, help='The path prefix for output files.') 312 args, beam_args = parser.parse_known_args() 313 314 # Create the Pipeline with remaining arguments. 315 beam_options = PipelineOptions(beam_args) 316 with beam.Pipeline(options=beam_options) as pipeline: 317 lines = ( 318 pipeline 319 | 'Read files' >> beam.io.ReadFromText(args.input_file) 320 | 'Write files' >> beam.io.WriteToText(args.output_path)) 321 # [END pipeline_options_command_line] 322 323 324 def pipeline_logging(lines, output): 325 """Logging Pipeline Messages.""" 326 327 import re 328 import apache_beam as beam 329 330 # [START pipeline_logging] 331 # import Python logging module. 332 import logging 333 334 class ExtractWordsFn(beam.DoFn): 335 def process(self, element): 336 words = re.findall(r'[A-Za-z\']+', element) 337 for word in words: 338 yield word 339 340 if word.lower() == 'love': 341 # Log using the root logger at info or higher levels 342 logging.info('Found : %s', word.lower()) 343 344 # Remaining WordCount example code ... 345 # [END pipeline_logging] 346 347 with TestPipeline() as pipeline: # Use TestPipeline for testing. 348 ( 349 pipeline 350 | beam.Create(lines) 351 | beam.ParDo(ExtractWordsFn()) 352 | beam.io.WriteToText(output)) 353 354 355 def pipeline_monitoring(): 356 """Using monitoring interface snippets.""" 357 358 import argparse 359 import re 360 import apache_beam as beam 361 362 class ExtractWordsFn(beam.DoFn): 363 def process(self, element): 364 words = re.findall(r'[A-Za-z\']+', element) 365 for word in words: 366 yield word 367 368 class FormatCountsFn(beam.DoFn): 369 def process(self, element): 370 word, count = element 371 yield '%s: %s' % (word, count) 372 373 # [START pipeline_monitoring_composite] 374 # The CountWords Composite Transform inside the WordCount pipeline. 375 @beam.ptransform_fn 376 def CountWords(pcoll): 377 return ( 378 pcoll 379 # Convert lines of text into individual words. 380 | 'ExtractWords' >> beam.ParDo(ExtractWordsFn()) 381 # Count the number of times each word occurs. 382 | beam.combiners.Count.PerElement() 383 # Format each word and count into a printable string. 
384 | 'FormatCounts' >> beam.ParDo(FormatCountsFn())) 385 386 # [END pipeline_monitoring_composite] 387 388 parser = argparse.ArgumentParser() 389 parser.add_argument( 390 '--input-file', 391 default='gs://dataflow-samples/shakespeare/kinglear.txt', 392 help='The file path for the input text to process.') 393 parser.add_argument( 394 '--output-path', required=True, help='The path prefix for output files.') 395 args, _ = parser.parse_known_args() 396 397 with TestPipeline() as pipeline: # Use TestPipeline for testing. 398 399 # [START pipeline_monitoring_execution] 400 ( 401 pipeline 402 # Read the lines of the input text. 403 | 'ReadLines' >> beam.io.ReadFromText(args.input_file) 404 # Count the words. 405 | CountWords() 406 # Write the formatted word counts to output. 407 | 'WriteCounts' >> beam.io.WriteToText(args.output_path)) 408 # [END pipeline_monitoring_execution] 409 410 411 def examples_wordcount_minimal(): 412 """MinimalWordCount example snippets.""" 413 import re 414 415 import apache_beam as beam 416 417 # [START examples_wordcount_minimal_options] 418 from apache_beam.options.pipeline_options import PipelineOptions 419 420 input_file = 'gs://dataflow-samples/shakespeare/kinglear.txt' 421 output_path = 'gs://my-bucket/counts.txt' 422 423 beam_options = PipelineOptions( 424 runner='DataflowRunner', 425 project='my-project-id', 426 job_name='unique-job-name', 427 temp_location='gs://my-bucket/temp', 428 ) 429 # [END examples_wordcount_minimal_options] 430 431 # Run it locally for testing. 432 import argparse 433 434 parser = argparse.ArgumentParser() 435 parser.add_argument('--input-file') 436 parser.add_argument('--output-path') 437 args, beam_args = parser.parse_known_args() 438 439 input_file = args.input_file 440 output_path = args.output_path 441 442 beam_options = PipelineOptions(beam_args) 443 444 # [START examples_wordcount_minimal_create] 445 pipeline = beam.Pipeline(options=beam_options) 446 # [END examples_wordcount_minimal_create] 447 448 ( 449 # [START examples_wordcount_minimal_read] 450 pipeline 451 | beam.io.ReadFromText(input_file) 452 # [END examples_wordcount_minimal_read] 453 454 # [START examples_wordcount_minimal_pardo] 455 | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 456 # [END examples_wordcount_minimal_pardo] 457 458 # [START examples_wordcount_minimal_count] 459 | beam.combiners.Count.PerElement() 460 # [END examples_wordcount_minimal_count] 461 462 # [START examples_wordcount_minimal_map] 463 | beam.MapTuple(lambda word, count: '%s: %s' % (word, count)) 464 # [END examples_wordcount_minimal_map] 465 466 # [START examples_wordcount_minimal_write] 467 | beam.io.WriteToText(output_path) 468 # [END examples_wordcount_minimal_write] 469 ) 470 471 # [START examples_wordcount_minimal_run] 472 result = pipeline.run() 473 # [END examples_wordcount_minimal_run] 474 result.wait_until_finish() 475 476 477 def examples_wordcount_wordcount(): 478 """WordCount example snippets.""" 479 import re 480 481 import apache_beam as beam 482 from apache_beam.options.pipeline_options import PipelineOptions 483 484 # [START examples_wordcount_wordcount_options] 485 import argparse 486 487 parser = argparse.ArgumentParser() 488 parser.add_argument( 489 '--input-file', 490 default='gs://dataflow-samples/shakespeare/kinglear.txt', 491 help='The file path for the input text to process.') 492 parser.add_argument( 493 '--output-path', required=True, help='The path prefix for output files.') 494 args, beam_args = parser.parse_known_args() 495 496 
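  # Note: parse_known_args() returns the arguments declared above in `args` and
  # leaves everything it does not recognize (for example --runner or --project)
  # in `beam_args`, which is forwarded to PipelineOptions below.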
beam_options = PipelineOptions(beam_args) 497 with beam.Pipeline(options=beam_options) as pipeline: 498 lines = pipeline | beam.io.ReadFromText(args.input_file) 499 500 # [END examples_wordcount_wordcount_options] 501 502 # [START examples_wordcount_wordcount_composite] 503 @beam.ptransform_fn 504 def CountWords(pcoll): 505 return ( 506 pcoll 507 # Convert lines of text into individual words. 508 | 'ExtractWords' >> 509 beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 510 511 # Count the number of times each word occurs. 512 | beam.combiners.Count.PerElement()) 513 514 counts = lines | CountWords() 515 516 # [END examples_wordcount_wordcount_composite] 517 518 # [START examples_wordcount_wordcount_dofn] 519 class FormatAsTextFn(beam.DoFn): 520 def process(self, element): 521 word, count = element 522 yield '%s: %s' % (word, count) 523 524 formatted = counts | beam.ParDo(FormatAsTextFn()) 525 # [END examples_wordcount_wordcount_dofn] 526 527 formatted | beam.io.WriteToText(args.output_path) 528 529 530 def examples_wordcount_templated(): 531 """Templated WordCount example snippet.""" 532 import re 533 534 import apache_beam as beam 535 from apache_beam.io import ReadFromText 536 from apache_beam.io import WriteToText 537 from apache_beam.options.pipeline_options import PipelineOptions 538 539 # [START example_wordcount_templated] 540 class WordcountTemplatedOptions(PipelineOptions): 541 @classmethod 542 def _add_argparse_args(cls, parser): 543 # Use add_value_provider_argument for arguments to be templatable 544 # Use add_argument as usual for non-templatable arguments 545 parser.add_value_provider_argument( 546 '--input-file', 547 default='gs://dataflow-samples/shakespeare/kinglear.txt', 548 help='The file path for the input text to process.') 549 parser.add_argument( 550 '--output-path', 551 required=True, 552 help='The path prefix for output files.') 553 554 beam_options = PipelineOptions() 555 args = beam_options.view_as(WordcountTemplatedOptions) 556 557 with beam.Pipeline(options=beam_options) as pipeline: 558 lines = pipeline | 'Read' >> ReadFromText(args.input_file.get()) 559 560 # [END example_wordcount_templated] 561 562 def format_result(word_count): 563 (word, count) = word_count 564 return '%s: %s' % (word, count) 565 566 ( 567 lines 568 | 569 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 570 | 'PairWithOnes' >> beam.Map(lambda x: (x, 1)) 571 | 'Group' >> beam.GroupByKey() 572 | 573 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))) 574 | 'Format' >> beam.Map(format_result) 575 | 'Write' >> WriteToText(args.output_path)) 576 577 578 def examples_wordcount_debugging(renames): 579 """DebuggingWordCount example snippets.""" 580 import re 581 582 import apache_beam as beam 583 584 # [START example_wordcount_debugging_logging] 585 # [START example_wordcount_debugging_aggregators] 586 import logging 587 588 class FilterTextFn(beam.DoFn): 589 """A DoFn that filters for a specific key based on a regular expression.""" 590 def __init__(self, pattern): 591 self.pattern = pattern 592 # A custom metric can track values in your pipeline as it runs. Create 593 # custom metrics matched_word and unmatched_words. 594 self.matched_words = Metrics.counter(self.__class__, 'matched_words') 595 self.umatched_words = Metrics.counter(self.__class__, 'umatched_words') 596 597 def process(self, element): 598 word, _ = element 599 if re.match(self.pattern, word): 600 # Log at INFO level each element we match. 
When executing this pipeline 601 # using the Dataflow service, these log lines will appear in the Cloud 602 # Logging UI. 603 logging.info('Matched %s', word) 604 605 # Add 1 to the custom metric counter matched_words 606 self.matched_words.inc() 607 yield element 608 else: 609 # Log at the "DEBUG" level each element that is not matched. Different 610 # log levels can be used to control the verbosity of logging providing 611 # an effective mechanism to filter less important information. Note 612 # currently only "INFO" and higher level logs are emitted to the Cloud 613 # Logger. This log message will not be visible in the Cloud Logger. 614 logging.debug('Did not match %s', word) 615 616 # Add 1 to the custom metric counter umatched_words 617 self.umatched_words.inc() 618 619 # [END example_wordcount_debugging_logging] 620 # [END example_wordcount_debugging_aggregators] 621 622 with TestPipeline() as pipeline: # Use TestPipeline for testing. 623 filtered_words = ( 624 pipeline 625 | 626 beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt') 627 | 628 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 629 | beam.combiners.Count.PerElement() 630 | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) 631 632 # [START example_wordcount_debugging_assert] 633 beam.testing.util.assert_that( 634 filtered_words, 635 beam.testing.util.equal_to([('Flourish', 3), ('stomach', 1)])) 636 637 # [END example_wordcount_debugging_assert] 638 639 def format_result(word_count): 640 (word, count) = word_count 641 return '%s: %s' % (word, count) 642 643 output = ( 644 filtered_words 645 | 'format' >> beam.Map(format_result) 646 | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt')) 647 648 pipeline.visit(SnippetUtils.RenameFiles(renames)) 649 650 651 def examples_wordcount_streaming(): 652 import apache_beam as beam 653 from apache_beam import window 654 from apache_beam.options.pipeline_options import PipelineOptions 655 656 # Parse out arguments. 657 parser = argparse.ArgumentParser() 658 parser.add_argument( 659 '--output_topic', 660 required=True, 661 help=( 662 'Output PubSub topic of the form ' 663 '"projects/<PROJECT>/topic/<TOPIC>".')) 664 group = parser.add_mutually_exclusive_group(required=True) 665 group.add_argument( 666 '--input_topic', 667 help=( 668 'Input PubSub topic of the form ' 669 '"projects/<PROJECT>/topics/<TOPIC>".')) 670 group.add_argument( 671 '--input_subscription', 672 help=( 673 'Input PubSub subscription of the form ' 674 '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) 675 args, beam_args = parser.parse_known_args() 676 677 beam_options = PipelineOptions(beam_args, streaming=True) 678 679 with TestPipeline(options=beam_options) as pipeline: 680 # [START example_wordcount_streaming_read] 681 # Read from Pub/Sub into a PCollection. 
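    # Note: by default ReadFromPubSub yields message payloads as bytes, which
    # is why the 'DecodeUnicode' step below decodes them to str.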
682 if args.input_subscription: 683 lines = pipeline | beam.io.ReadFromPubSub( 684 subscription=args.input_subscription) 685 else: 686 lines = pipeline | beam.io.ReadFromPubSub(topic=args.input_topic) 687 # [END example_wordcount_streaming_read] 688 689 output = ( 690 lines 691 | 'DecodeUnicode' >> beam.Map(lambda encoded: encoded.decode('utf-8')) 692 | 'ExtractWords' >> 693 beam.FlatMap(lambda x: __import__('re').findall(r'[A-Za-z\']+', x)) 694 | 'PairWithOnes' >> beam.Map(lambda x: (x, 1)) 695 | beam.WindowInto(window.FixedWindows(15, 0)) 696 | 'Group' >> beam.GroupByKey() 697 | 698 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))) 699 | 'Format' >> 700 beam.MapTuple(lambda word, count: f'{word}: {count}'.encode('utf-8'))) 701 702 # [START example_wordcount_streaming_write] 703 # Write to Pub/Sub 704 output | beam.io.WriteToPubSub(args.output_topic) 705 # [END example_wordcount_streaming_write] 706 707 708 def examples_ptransforms_templated(renames): 709 # [START examples_ptransforms_templated] 710 import apache_beam as beam 711 from apache_beam.io import WriteToText 712 from apache_beam.options.pipeline_options import PipelineOptions 713 from apache_beam.options.value_provider import StaticValueProvider 714 715 class TemplatedUserOptions(PipelineOptions): 716 @classmethod 717 def _add_argparse_args(cls, parser): 718 parser.add_value_provider_argument('--templated_int', type=int) 719 720 class MySumFn(beam.DoFn): 721 def __init__(self, templated_int): 722 self.templated_int = templated_int 723 724 def process(self, an_int): 725 yield self.templated_int.get() + an_int 726 727 beam_options = PipelineOptions() 728 args = beam_options.view_as(TemplatedUserOptions) 729 730 with beam.Pipeline(options=beam_options) as pipeline: 731 my_sum_fn = MySumFn(args.templated_int) 732 sum = ( 733 pipeline 734 | 'ReadCollection' >> 735 beam.io.ReadFromText('gs://some/integer_collection') 736 | 'StringToInt' >> beam.Map(lambda w: int(w)) 737 | 'AddGivenInt' >> beam.ParDo(my_sum_fn) 738 | 'WriteResultingCollection' >> WriteToText('some/output_path')) 739 # [END examples_ptransforms_templated] 740 741 # Templates are not supported by DirectRunner (only by DataflowRunner) 742 # so a value must be provided at graph-construction time 743 my_sum_fn.templated_int = StaticValueProvider(int, 10) 744 745 pipeline.visit(SnippetUtils.RenameFiles(renames)) 746 747 748 # Defining a new source. 
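# A bounded source implements four methods, all of which CountingSource below
# exercises: estimate_size (an estimate of the total output size),
# get_range_tracker (backed here by OffsetRangeTracker), read (which must
# try_claim each position before emitting it), and split (which carves the
# range into SourceBundles of roughly desired_bundle_size).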
749 # [START model_custom_source_new_source] 750 class CountingSource(iobase.BoundedSource): 751 def __init__(self, count): 752 self.records_read = Metrics.counter(self.__class__, 'recordsRead') 753 self._count = count 754 755 def estimate_size(self): 756 return self._count 757 758 def get_range_tracker(self, start_position, stop_position): 759 if start_position is None: 760 start_position = 0 761 if stop_position is None: 762 stop_position = self._count 763 764 return OffsetRangeTracker(start_position, stop_position) 765 766 def read(self, range_tracker): 767 for i in range(range_tracker.start_position(), 768 range_tracker.stop_position()): 769 if not range_tracker.try_claim(i): 770 return 771 self.records_read.inc() 772 yield i 773 774 def split(self, desired_bundle_size, start_position=None, stop_position=None): 775 if start_position is None: 776 start_position = 0 777 if stop_position is None: 778 stop_position = self._count 779 780 bundle_start = start_position 781 while bundle_start < stop_position: 782 bundle_stop = min(stop_position, bundle_start + desired_bundle_size) 783 yield iobase.SourceBundle( 784 weight=(bundle_stop - bundle_start), 785 source=self, 786 start_position=bundle_start, 787 stop_position=bundle_stop) 788 bundle_start = bundle_stop 789 790 791 # [END model_custom_source_new_source] 792 793 794 # We recommend users to start Source classes with an underscore to discourage 795 # using the Source class directly when a PTransform for the source is 796 # available. We simulate that here by simply extending the previous Source 797 # class. 798 class _CountingSource(CountingSource): 799 pass 800 801 802 # [START model_custom_source_new_ptransform] 803 class ReadFromCountingSource(PTransform): 804 def __init__(self, count): 805 super().__init__() 806 self._count = count 807 808 def expand(self, pcoll): 809 return pcoll | iobase.Read(_CountingSource(self._count)) 810 811 812 # [END model_custom_source_new_ptransform] 813 814 815 def model_custom_source(count): 816 """Demonstrates creating a new custom source and using it in a pipeline. 817 818 Defines a new source ``CountingSource`` that produces integers starting from 0 819 up to a given size. 820 821 Uses the new source in an example pipeline. 822 823 Additionally demonstrates how a source should be implemented using a 824 ``PTransform``. This is the recommended way to develop sources that are to 825 distributed to a large number of end users. 826 827 This method runs two pipelines. 828 829 (1) A pipeline that uses ``CountingSource`` directly using the ``df.Read`` 830 transform. 831 (2) A pipeline that uses a custom ``PTransform`` that wraps 832 ``CountingSource``. 833 834 Args: 835 count: the size of the counting source to be used in the pipeline 836 demonstrated in this method. 837 838 """ 839 840 # Using the source in an example pipeline. 
841 # [START model_custom_source_use_new_source] 842 with beam.Pipeline() as pipeline: 843 numbers = pipeline | 'ProduceNumbers' >> beam.io.Read(CountingSource(count)) 844 # [END model_custom_source_use_new_source] 845 846 lines = numbers | beam.core.Map(lambda number: 'line %d' % number) 847 assert_that( 848 lines, equal_to(['line ' + str(number) for number in range(0, count)])) 849 850 # [START model_custom_source_use_ptransform] 851 with beam.Pipeline() as pipeline: 852 numbers = pipeline | 'ProduceNumbers' >> ReadFromCountingSource(count) 853 # [END model_custom_source_use_ptransform] 854 855 lines = numbers | beam.core.Map(lambda number: 'line %d' % number) 856 assert_that( 857 lines, equal_to(['line ' + str(number) for number in range(0, count)])) 858 859 860 # Defining the new sink. 861 # 862 # Defines a new sink ``SimpleKVSink`` that demonstrates writing to a simple 863 # key-value based storage system which has following API. 864 # 865 # simplekv.connect(url) - 866 # connects to the storage system and returns an access token which can be 867 # used to perform further operations 868 # simplekv.open_table(access_token, table_name) - 869 # creates a table named 'table_name'. Returns a table object. 870 # simplekv.write_to_table(access_token, table, key, value) - 871 # writes a key-value pair to the given table. 872 # simplekv.rename_table(access_token, old_name, new_name) - 873 # renames the table named 'old_name' to 'new_name'. 874 # 875 # [START model_custom_sink_new_sink] 876 class SimpleKVSink(iobase.Sink): 877 def __init__(self, simplekv, url, final_table_name): 878 self._simplekv = simplekv 879 self._url = url 880 self._final_table_name = final_table_name 881 882 def initialize_write(self): 883 access_token = self._simplekv.connect(self._url) 884 return access_token 885 886 def open_writer(self, access_token, uid): 887 table_name = 'table' + uid 888 return SimpleKVWriter(self._simplekv, access_token, table_name) 889 890 def pre_finalize(self, init_result, writer_results): 891 pass 892 893 def finalize_write(self, access_token, table_names, pre_finalize_result): 894 for i, table_name in enumerate(table_names): 895 self._simplekv.rename_table( 896 access_token, table_name, self._final_table_name + str(i)) 897 898 899 # [END model_custom_sink_new_sink] 900 901 902 # Defining a writer for the new sink. 903 # [START model_custom_sink_new_writer] 904 class SimpleKVWriter(iobase.Writer): 905 def __init__(self, simplekv, access_token, table_name): 906 self._simplekv = simplekv 907 self._access_token = access_token 908 self._table_name = table_name 909 self._table = self._simplekv.open_table(access_token, table_name) 910 911 def write(self, record): 912 key, value = record 913 914 self._simplekv.write_to_table(self._access_token, self._table, key, value) 915 916 def close(self): 917 return self._table_name 918 919 920 # [END model_custom_sink_new_writer] 921 922 923 # [START model_custom_sink_new_ptransform] 924 class WriteToKVSink(PTransform): 925 def __init__(self, simplekv, url, final_table_name): 926 self._simplekv = simplekv 927 super().__init__() 928 self._url = url 929 self._final_table_name = final_table_name 930 931 def expand(self, pcoll): 932 return pcoll | iobase.Write( 933 _SimpleKVSink(self._simplekv, self._url, self._final_table_name)) 934 935 936 # [END model_custom_sink_new_ptransform] 937 938 939 # We recommend users to start Sink class names with an underscore to 940 # discourage using the Sink class directly when a PTransform for the sink is 941 # available. 
We simulate that here by simply extending the previous Sink 942 # class. 943 class _SimpleKVSink(SimpleKVSink): 944 pass 945 946 947 def model_custom_sink( 948 simplekv, 949 KVs, 950 final_table_name_no_ptransform, 951 final_table_name_with_ptransform): 952 """Demonstrates creating a new custom sink and using it in a pipeline. 953 954 Uses the new sink in an example pipeline. 955 956 Additionally demonstrates how a sink should be implemented using a 957 ``PTransform``. This is the recommended way to develop sinks that are to be 958 distributed to a large number of end users. 959 960 This method runs two pipelines. 961 962 (1) A pipeline that uses ``SimpleKVSink`` directly using the ``df.Write`` 963 transform. 964 (2) A pipeline that uses a custom ``PTransform`` that wraps 965 ``SimpleKVSink``. 966 967 Args: 968 simplekv: an object that mocks the key-value storage. 969 970 KVs: the set of key-value pairs to be written in the example pipeline. 971 972 final_table_name_no_ptransform: the prefix of final set of tables to be 973 created by the example pipeline that uses 974 ``SimpleKVSink`` directly. 975 976 final_table_name_with_ptransform: the prefix of final set of tables to be 977 created by the example pipeline that uses 978 a ``PTransform`` that wraps 979 ``SimpleKVSink``. 980 """ 981 982 final_table_name = final_table_name_no_ptransform 983 984 # Using the new sink in an example pipeline. 985 # [START model_custom_sink_use_new_sink] 986 with beam.Pipeline(options=PipelineOptions()) as pipeline: 987 kvs = pipeline | 'CreateKVs' >> beam.Create(KVs) 988 989 kvs | 'WriteToSimpleKV' >> beam.io.Write( 990 SimpleKVSink(simplekv, 'http://url_to_simple_kv/', final_table_name)) 991 # [END model_custom_sink_use_new_sink] 992 993 final_table_name = final_table_name_with_ptransform 994 995 # [START model_custom_sink_use_ptransform] 996 with beam.Pipeline(options=PipelineOptions()) as pipeline: 997 kvs = pipeline | 'CreateKVs' >> beam.core.Create(KVs) 998 kvs | 'WriteToSimpleKV' >> WriteToKVSink( 999 simplekv, 'http://url_to_simple_kv/', final_table_name) 1000 # [END model_custom_sink_use_ptransform] 1001 1002 1003 def model_textio(renames): 1004 """Using a Read and Write transform to read/write text files.""" 1005 def filter_words(x): 1006 import re 1007 return re.findall(r'[A-Za-z\']+', x) 1008 1009 # [START model_textio_read] 1010 with beam.Pipeline(options=PipelineOptions()) as pipeline: 1011 # [START model_pipelineio_read] 1012 lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText( 1013 'path/to/input-*.csv') 1014 # [END model_pipelineio_read] 1015 # [END model_textio_read] 1016 1017 # [START model_textio_write] 1018 filtered_words = lines | 'FilterWords' >> beam.FlatMap(filter_words) 1019 # [START model_pipelineio_write] 1020 filtered_words | 'WriteToText' >> beam.io.WriteToText( 1021 '/path/to/numbers', file_name_suffix='.csv') 1022 # [END model_pipelineio_write] 1023 # [END model_textio_write] 1024 1025 pipeline.visit(SnippetUtils.RenameFiles(renames)) 1026 1027 1028 def model_textio_compressed(renames, expected): 1029 """Using a Read Transform to read compressed text files.""" 1030 with TestPipeline() as pipeline: 1031 1032 # [START model_textio_write_compressed] 1033 lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText( 1034 '/path/to/input-*.csv.gz', 1035 compression_type=beam.io.filesystem.CompressionTypes.GZIP) 1036 # [END model_textio_write_compressed] 1037 1038 assert_that(lines, equal_to(expected)) 1039 pipeline.visit(SnippetUtils.RenameFiles(renames)) 1040 1041 1042 def 
model_datastoreio(): 1043 """Using a Read and Write transform to read/write to Cloud Datastore.""" 1044 1045 import uuid 1046 import apache_beam as beam 1047 from apache_beam.options.pipeline_options import PipelineOptions 1048 from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore 1049 from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore 1050 from apache_beam.io.gcp.datastore.v1new.types import Entity 1051 from apache_beam.io.gcp.datastore.v1new.types import Key 1052 from apache_beam.io.gcp.datastore.v1new.types import Query 1053 1054 project = 'my_project' 1055 kind = 'my_kind' 1056 query = Query(kind, project) 1057 1058 # [START model_datastoreio_read] 1059 pipeline = beam.Pipeline(options=PipelineOptions()) 1060 entities = pipeline | 'Read From Datastore' >> ReadFromDatastore(query) 1061 # [END model_datastoreio_read] 1062 1063 # [START model_datastoreio_write] 1064 pipeline = beam.Pipeline(options=PipelineOptions()) 1065 musicians = pipeline | 'Musicians' >> beam.Create( 1066 ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi']) 1067 1068 def to_entity(content): 1069 key = Key([kind, str(uuid.uuid4())]) 1070 entity = Entity(key) 1071 entity.set_properties({'content': content}) 1072 return entity 1073 1074 entities = musicians | 'To Entity' >> beam.Map(to_entity) 1075 entities | 'Write To Datastore' >> WriteToDatastore(project) 1076 # [END model_datastoreio_write] 1077 1078 1079 def model_bigqueryio( 1080 pipeline, write_project='', write_dataset='', write_table=''): 1081 """Using a Read and Write transform to read/write from/to BigQuery.""" 1082 1083 # [START model_bigqueryio_table_spec] 1084 # project-id:dataset_id.table_id 1085 table_spec = 'clouddataflow-readonly:samples.weather_stations' 1086 # [END model_bigqueryio_table_spec] 1087 1088 # [START model_bigqueryio_table_spec_without_project] 1089 # dataset_id.table_id 1090 table_spec = 'samples.weather_stations' 1091 # [END model_bigqueryio_table_spec_without_project] 1092 1093 # [START model_bigqueryio_table_spec_object] 1094 from apache_beam.io.gcp.internal.clients import bigquery 1095 1096 table_spec = bigquery.TableReference( 1097 projectId='clouddataflow-readonly', 1098 datasetId='samples', 1099 tableId='weather_stations') 1100 # [END model_bigqueryio_table_spec_object] 1101 1102 # [START model_bigqueryio_data_types] 1103 bigquery_data = [{ 1104 'string': 'abc', 1105 'bytes': base64.b64encode(b'\xab\xac'), 1106 'integer': 5, 1107 'float': 0.5, 1108 'numeric': Decimal('5'), 1109 'boolean': True, 1110 'timestamp': '2018-12-31 12:44:31.744957 UTC', 1111 'date': '2018-12-31', 1112 'time': '12:44:31', 1113 'datetime': '2018-12-31T12:44:31', 1114 'geography': 'POINT(30 10)' 1115 }] 1116 # [END model_bigqueryio_data_types] 1117 1118 # [START model_bigqueryio_read_table] 1119 max_temperatures = ( 1120 pipeline 1121 | 'ReadTable' >> beam.io.ReadFromBigQuery(table=table_spec) 1122 # Each row is a dictionary where the keys are the BigQuery columns 1123 | beam.Map(lambda elem: elem['max_temperature'])) 1124 # [END model_bigqueryio_read_table] 1125 1126 # [START model_bigqueryio_read_query] 1127 max_temperatures = ( 1128 pipeline 1129 | 'QueryTable' >> beam.io.ReadFromBigQuery( 1130 query='SELECT max_temperature FROM '\ 1131 '[clouddataflow-readonly:samples.weather_stations]') 1132 # Each row is a dictionary where the keys are the BigQuery columns 1133 | beam.Map(lambda elem: elem['max_temperature'])) 1134 # [END model_bigqueryio_read_query] 1135 1136 # [START model_bigqueryio_read_query_std_sql] 1137 
  max_temperatures = (
      pipeline
      | 'QueryTableStdSQL' >> beam.io.ReadFromBigQuery(
          query='SELECT max_temperature FROM '\
          '`clouddataflow-readonly.samples.weather_stations`',
          use_standard_sql=True)
      # Each row is a dictionary where the keys are the BigQuery columns
      | beam.Map(lambda elem: elem['max_temperature']))
  # [END model_bigqueryio_read_query_std_sql]

  # [START model_bigqueryio_schema]
  # column_name:BIGQUERY_TYPE, ...
  table_schema = 'source:STRING, quote:STRING'
  # [END model_bigqueryio_schema]

  # [START model_bigqueryio_schema_object]
  table_schema = {
      'fields': [{
          'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'quote', 'type': 'STRING', 'mode': 'REQUIRED'
      }]
  }
  # [END model_bigqueryio_schema_object]

  if write_project and write_dataset and write_table:
    table_spec = '{}:{}.{}'.format(write_project, write_dataset, write_table)

  # [START model_bigqueryio_write_input]
  quotes = pipeline | beam.Create([
      {
          'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'
      },
      {
          'source': 'Yoda', 'quote': "Do, or do not. There is no 'try'."
      },
  ])
  # [END model_bigqueryio_write_input]

  # [START model_bigqueryio_write]
  quotes | beam.io.WriteToBigQuery(
      table_spec,
      schema=table_schema,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
  # [END model_bigqueryio_write]

  # [START model_bigqueryio_write_dynamic_destinations]
  fictional_characters_view = beam.pvalue.AsDict(
      pipeline | 'CreateCharacters' >> beam.Create([('Yoda', True),
                                                    ('Obi Wan Kenobi', True)]))

  def table_fn(element, fictional_characters):
    if element in fictional_characters:
      return 'my_dataset.fictional_quotes'
    else:
      return 'my_dataset.real_quotes'

  quotes | 'WriteWithDynamicDestination' >> beam.io.WriteToBigQuery(
      table_fn,
      schema=table_schema,
      table_side_inputs=(fictional_characters_view, ),
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
  # [END model_bigqueryio_write_dynamic_destinations]

  # [START model_bigqueryio_time_partitioning]
  quotes | 'WriteWithTimePartitioning' >> beam.io.WriteToBigQuery(
      table_spec,
      schema=table_schema,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      additional_bq_parameters={'timePartitioning': {
          'type': 'HOUR'
      }})
  # [END model_bigqueryio_time_partitioning]


def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To specify the transform's behavior, override the "expand" method, which
  takes a PCollection as its only parameter and returns a PCollection.
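
  For example (the name `lines` is illustrative), applying the CountWords
  transform defined below to a PCollection of text lines:

    formatted_counts = lines | CountWords()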
1222 """ 1223 import re 1224 1225 import apache_beam as beam 1226 1227 # [START composite_transform_example] 1228 # [START composite_ptransform_apply_method] 1229 # [START composite_ptransform_declare] 1230 class CountWords(beam.PTransform): 1231 # [END composite_ptransform_declare] 1232 1233 def expand(self, pcoll): 1234 return ( 1235 pcoll 1236 | beam.FlatMap(lambda x: re.findall(r'\w+', x)) 1237 | beam.combiners.Count.PerElement() 1238 | beam.Map(lambda word_c: '%s: %s' % (word_c[0], word_c[1]))) 1239 1240 # [END composite_ptransform_apply_method] 1241 # [END composite_transform_example] 1242 1243 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1244 ( 1245 pipeline 1246 | beam.Create(contents) 1247 | CountWords() 1248 | beam.io.WriteToText(output_path)) 1249 1250 1251 def model_multiple_pcollections_flatten(contents, output_path): 1252 """Merging a PCollection with Flatten.""" 1253 some_hash_fn = lambda s: ord(s[0]) 1254 partition_fn = lambda element, partitions: some_hash_fn(element) % partitions 1255 import apache_beam as beam 1256 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1257 1258 # Partition into deciles 1259 partitioned = pipeline | beam.Create(contents) | beam.Partition( 1260 partition_fn, 3) 1261 pcoll1 = partitioned[0] 1262 pcoll2 = partitioned[1] 1263 pcoll3 = partitioned[2] 1264 1265 # Flatten them back into 1 1266 1267 # A collection of PCollection objects can be represented simply 1268 # as a tuple (or list) of PCollections. 1269 # (The SDK for Python has no separate type to store multiple 1270 # PCollection objects, whether containing the same or different 1271 # types.) 1272 # [START model_multiple_pcollections_flatten] 1273 merged = ( 1274 (pcoll1, pcoll2, pcoll3) 1275 # A list of tuples can be "piped" directly into a Flatten transform. 1276 | beam.Flatten()) 1277 # [END model_multiple_pcollections_flatten] 1278 merged | beam.io.WriteToText(output_path) 1279 1280 1281 def model_multiple_pcollections_partition(contents, output_path): 1282 """Splitting a PCollection with Partition.""" 1283 some_hash_fn = lambda s: ord(s[0]) 1284 1285 def get_percentile(i): 1286 """Assume i in [0,100).""" 1287 return i 1288 1289 import apache_beam as beam 1290 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1291 1292 students = pipeline | beam.Create(contents) 1293 1294 # [START model_multiple_pcollections_partition] 1295 def partition_fn(student, num_partitions): 1296 return int(get_percentile(student) * num_partitions / 100) 1297 1298 by_decile = students | beam.Partition(partition_fn, 10) 1299 # [END model_multiple_pcollections_partition] 1300 # [START model_multiple_pcollections_partition_40th] 1301 fortieth_percentile = by_decile[4] 1302 # [END model_multiple_pcollections_partition_40th] 1303 1304 ([by_decile[d] for d in range(10) if d != 4] + [fortieth_percentile] 1305 | beam.Flatten() 1306 | beam.io.WriteToText(output_path)) 1307 1308 1309 def model_group_by_key(contents, output_path): 1310 """Applying a GroupByKey Transform.""" 1311 import re 1312 1313 import apache_beam as beam 1314 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1315 1316 def count_ones(word_ones): 1317 (word, ones) = word_ones 1318 return (word, sum(ones)) 1319 1320 words_and_counts = ( 1321 pipeline 1322 | beam.Create(contents) 1323 | beam.FlatMap(lambda x: re.findall(r'\w+', x)) 1324 | 'one word' >> beam.Map(lambda w: (w, 1))) 1325 # GroupByKey accepts a PCollection of (w, 1) and 1326 # outputs a PCollection of (w, (1, 1, ...)). 
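    # For example, [('cat', 1), ('dog', 1), ('cat', 1)] groups to
    # [('cat', (1, 1)), ('dog', (1,))], where each grouped value is an
    # iterable (shown here as a tuple for illustration).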
1327 # (A key/value pair is just a tuple in Python.) 1328 # This is a somewhat forced example, since one could 1329 # simply use beam.combiners.Count.PerElement here. 1330 # [START model_group_by_key_transform] 1331 grouped_words = words_and_counts | beam.GroupByKey() 1332 # [END model_group_by_key_transform] 1333 ( 1334 grouped_words 1335 | 'count words' >> beam.Map(count_ones) 1336 | beam.io.WriteToText(output_path)) 1337 1338 1339 def model_co_group_by_key_tuple(emails, phones, output_path): 1340 """Applying a CoGroupByKey Transform to a tuple.""" 1341 import apache_beam as beam 1342 # [START model_group_by_key_cogroupbykey_tuple] 1343 # The result PCollection contains one key-value element for each key in the 1344 # input PCollections. The key of the pair will be the key from the input and 1345 # the value will be a dictionary with two entries: 'emails' - an iterable of 1346 # all values for the current key in the emails PCollection and 'phones': an 1347 # iterable of all values for the current key in the phones PCollection. 1348 results = ({'emails': emails, 'phones': phones} | beam.CoGroupByKey()) 1349 1350 def join_info(name_info): 1351 (name, info) = name_info 1352 return '%s; %s; %s' %\ 1353 (name, sorted(info['emails']), sorted(info['phones'])) 1354 1355 contact_lines = results | beam.Map(join_info) 1356 # [END model_group_by_key_cogroupbykey_tuple] 1357 contact_lines | beam.io.WriteToText(output_path) 1358 1359 1360 def model_join_using_side_inputs( 1361 name_list, email_list, phone_list, output_path): 1362 """Joining PCollections using side inputs.""" 1363 1364 import apache_beam as beam 1365 from apache_beam.pvalue import AsIter 1366 1367 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1368 # [START model_join_using_side_inputs] 1369 # This code performs a join by receiving the set of names as an input and 1370 # passing PCollections that contain emails and phone numbers as side inputs 1371 # instead of using CoGroupByKey. 
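    # AsIter below hands the complete emails and phones PCollections to every
    # invocation of join_info as iterable side inputs.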
1372 names = pipeline | 'names' >> beam.Create(name_list) 1373 emails = pipeline | 'email' >> beam.Create(email_list) 1374 phones = pipeline | 'phone' >> beam.Create(phone_list) 1375 1376 def join_info(name, emails, phone_numbers): 1377 filtered_emails = [] 1378 for name_in_list, email in emails: 1379 if name_in_list == name: 1380 filtered_emails.append(email) 1381 1382 filtered_phone_numbers = [] 1383 for name_in_list, phone_number in phone_numbers: 1384 if name_in_list == name: 1385 filtered_phone_numbers.append(phone_number) 1386 1387 return '; '.join([ 1388 '%s' % name, 1389 '%s' % ','.join(filtered_emails), 1390 '%s' % ','.join(filtered_phone_numbers) 1391 ]) 1392 1393 contact_lines = names | 'CreateContacts' >> beam.core.Map( 1394 join_info, AsIter(emails), AsIter(phones)) 1395 # [END model_join_using_side_inputs] 1396 contact_lines | beam.io.WriteToText(output_path) 1397 1398 1399 # [START model_library_transforms_keys] 1400 class Keys(beam.PTransform): 1401 def expand(self, pcoll): 1402 return pcoll | 'Keys' >> beam.Map(lambda k_v: k_v[0]) 1403 1404 1405 # [END model_library_transforms_keys] 1406 # pylint: enable=invalid-name 1407 1408 1409 # [START model_library_transforms_count] 1410 class Count(beam.PTransform): 1411 def expand(self, pcoll): 1412 return ( 1413 pcoll 1414 | 'PairWithOne' >> beam.Map(lambda v: (v, 1)) 1415 | beam.CombinePerKey(sum)) 1416 1417 1418 # [END model_library_transforms_count] 1419 1420 1421 def file_process_pattern_access_metadata(): 1422 1423 import apache_beam as beam 1424 from apache_beam.io import fileio 1425 1426 # [START FileProcessPatternAccessMetadataSnip1] 1427 with beam.Pipeline() as pipeline: 1428 readable_files = ( 1429 pipeline 1430 | fileio.MatchFiles('hdfs://path/to/*.txt') 1431 | fileio.ReadMatches() 1432 | beam.Reshuffle()) 1433 files_and_contents = ( 1434 readable_files 1435 | beam.Map(lambda x: (x.metadata.path, x.read_utf8()))) 1436 # [END FileProcessPatternAccessMetadataSnip1] 1437 1438 1439 def accessing_valueprovider_info_after_run(): 1440 # [START AccessingValueProviderInfoAfterRunSnip1] 1441 import logging 1442 1443 import apache_beam as beam 1444 from apache_beam.options.pipeline_options import PipelineOptions 1445 from apache_beam.options.value_provider import RuntimeValueProvider 1446 1447 class MyOptions(PipelineOptions): 1448 @classmethod 1449 def _add_argparse_args(cls, parser): 1450 parser.add_value_provider_argument('--string_value', type=str) 1451 1452 class LogValueProvidersFn(beam.DoFn): 1453 def __init__(self, string_vp): 1454 self.string_vp = string_vp 1455 1456 # Define the DoFn that logs the ValueProvider value. 1457 # The DoFn is called when creating the pipeline branch. 1458 # This example logs the ValueProvider value, but 1459 # you could store it by pushing it to an external database. 1460 def process(self, an_int): 1461 logging.info('The string_value is %s' % self.string_vp.get()) 1462 # Another option (where you don't need to pass the value at all) is: 1463 logging.info( 1464 'The string value is %s' % 1465 RuntimeValueProvider.get_value('string_value', str, '')) 1466 1467 beam_options = PipelineOptions() 1468 args = beam_options.view_as(MyOptions) 1469 1470 # Create pipeline. 1471 with beam.Pipeline(options=beam_options) as pipeline: 1472 1473 # Add a branch for logging the ValueProvider value. 1474 _ = ( 1475 pipeline 1476 | beam.Create([None]) 1477 | 'LogValueProvs' >> beam.ParDo(LogValueProvidersFn(args.string_value))) 1478 1479 # The main pipeline. 
    result_pc = (
        pipeline
        | "main_pc" >> beam.Create([1, 2, 3])
        | beam.combiners.Sum.Globally())

  # [END AccessingValueProviderInfoAfterRunSnip1]


def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue
  from apache_beam.transforms import window

  # from apache_beam.utils.timestamp import MAX_TIMESTAMP
  # last_timestamp = MAX_TIMESTAMP to go on indefinitely

  # Any user-defined function.
  # A cross join is used as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline = beam.Pipeline()
  side_input = (
      pipeline
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      pipeline
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      |
      'MapMpToTimestamped' >> beam.Map(lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]

  return pipeline, result


def bigqueryio_deadletter():
  # [START BigQueryIODeadLetter]

  # Create pipeline.
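  # The dead-letter pattern: rows that cannot be inserted are returned by
  # WriteToBigQuery under the 'FailedRows' key of its result (accessed below
  # as errors['FailedRows']), so they can be logged or routed elsewhere
  # instead of being lost.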
1539 schema = ({'fields': [{'name': 'a', 'type': 'STRING', 'mode': 'REQUIRED'}]}) 1540 1541 pipeline = beam.Pipeline() 1542 1543 errors = ( 1544 pipeline | 'Data' >> beam.Create([1, 2]) 1545 | 'CreateBrokenData' >> 1546 beam.Map(lambda src: {'a': src} if src == 2 else {'a': None}) 1547 | 'WriteToBigQuery' >> beam.io.WriteToBigQuery( 1548 "<Your Project:Test.dummy_a_table", 1549 schema=schema, 1550 insert_retry_strategy='RETRY_ON_TRANSIENT_ERROR', 1551 create_disposition='CREATE_IF_NEEDED', 1552 write_disposition='WRITE_APPEND')) 1553 result = ( 1554 errors['FailedRows'] 1555 | 'PrintErrors' >> 1556 beam.FlatMap(lambda err: print("Error Found {}".format(err)))) 1557 # [END BigQueryIODeadLetter] 1558 1559 return result 1560 1561 1562 def extract_sentiments(response): 1563 # [START nlp_extract_sentiments] 1564 return { 1565 'sentences': [{ 1566 sentence.text.content: sentence.sentiment.score 1567 } for sentence in response.sentences], 1568 'document_sentiment': response.document_sentiment.score, 1569 } 1570 # [END nlp_extract_sentiments] 1571 1572 1573 def extract_entities(response): 1574 # [START nlp_extract_entities] 1575 return [{ 1576 'name': entity.name, 1577 'type': nlp.enums.Entity.Type(entity.type).name, 1578 } for entity in response.entities] 1579 # [END nlp_extract_entities] 1580 1581 1582 def analyze_dependency_tree(response): 1583 # [START analyze_dependency_tree] 1584 from collections import defaultdict 1585 adjacency_lists = [] 1586 1587 index = 0 1588 for sentence in response.sentences: 1589 adjacency_list = defaultdict(list) 1590 sentence_begin = sentence.text.begin_offset 1591 sentence_end = sentence_begin + len(sentence.text.content) - 1 1592 1593 while index < len(response.tokens) and \ 1594 response.tokens[index].text.begin_offset <= sentence_end: 1595 token = response.tokens[index] 1596 head_token_index = token.dependency_edge.head_token_index 1597 head_token_text = response.tokens[head_token_index].text.content 1598 adjacency_list[head_token_text].append(token.text.content) 1599 index += 1 1600 adjacency_lists.append(adjacency_list) 1601 # [END analyze_dependency_tree] 1602 1603 return adjacency_lists 1604 1605 1606 def nlp_analyze_text(): 1607 # [START nlp_analyze_text] 1608 features = nlp.types.AnnotateTextRequest.Features( 1609 extract_entities=True, 1610 extract_document_sentiment=True, 1611 extract_entity_sentiment=True, 1612 extract_syntax=True, 1613 ) 1614 1615 with beam.Pipeline() as pipeline: 1616 responses = ( 1617 pipeline 1618 | beam.Create([ 1619 'My experience so far has been fantastic! ' 1620 'I\'d really recommend this product.' 
1621 ]) 1622 | beam.Map(lambda x: nlp.Document(x, type='PLAIN_TEXT')) 1623 | nlp.AnnotateText(features)) 1624 1625 _ = ( 1626 responses 1627 | beam.Map(extract_sentiments) 1628 | 'Parse sentiments to JSON' >> beam.Map(json.dumps) 1629 | 'Write sentiments' >> beam.io.WriteToText('sentiments.txt')) 1630 1631 _ = ( 1632 responses 1633 | beam.Map(extract_entities) 1634 | 'Parse entities to JSON' >> beam.Map(json.dumps) 1635 | 'Write entities' >> beam.io.WriteToText('entities.txt')) 1636 1637 _ = ( 1638 responses 1639 | beam.Map(analyze_dependency_tree) 1640 | 'Parse adjacency list to JSON' >> beam.Map(json.dumps) 1641 | 'Write adjacency list' >> beam.io.WriteToText('adjancency_list.txt')) 1642 # [END nlp_analyze_text] 1643 1644 1645 def sdf_basic_example(): 1646 import os 1647 from apache_beam.io.restriction_trackers import OffsetRange 1648 read_next_record = None 1649 1650 # [START SDF_BasicExample] 1651 class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider 1652 ): 1653 def initial_restriction(self, file_name): 1654 return OffsetRange(0, os.stat(file_name).st_size) 1655 1656 def create_tracker(self, restriction): 1657 return beam.io.restriction_trackers.OffsetRestrictionTracker() 1658 1659 class FileToWordsFn(beam.DoFn): 1660 def process( 1661 self, 1662 file_name, 1663 # Alternatively, we can let FileToWordsFn itself inherit from 1664 # RestrictionProvider, implement the required methods and let 1665 # tracker=beam.DoFn.RestrictionParam() which will use self as 1666 # the provider. 1667 tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())): 1668 with open(file_name) as file_handle: 1669 file_handle.seek(tracker.current_restriction.start()) 1670 while tracker.try_claim(file_handle.tell()): 1671 yield read_next_record(file_handle) 1672 1673 # Providing the coder is only necessary if it can not be inferred at 1674 # runtime. 1675 def restriction_coder(self): 1676 return ... 1677 1678 # [END SDF_BasicExample] 1679 1680 1681 def sdf_basic_example_with_splitting(): 1682 from apache_beam.io.restriction_trackers import OffsetRange 1683 1684 # [START SDF_BasicExampleWithSplitting] 1685 class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider 1686 ): 1687 def split(self, file_name, restriction): 1688 # Compute and output 64 MiB size ranges to process in parallel 1689 split_size = 64 * (1 << 20) 1690 i = restriction.start 1691 while i < restriction.end - split_size: 1692 yield OffsetRange(i, i + split_size) 1693 i += split_size 1694 yield OffsetRange(i, restriction.end) 1695 1696 # [END SDF_BasicExampleWithSplitting] 1697 1698 1699 def sdf_sdk_initiated_checkpointing(): 1700 timestamp = None 1701 external_service = None 1702 1703 class MyRestrictionProvider(object): 1704 pass 1705 1706 # [START SDF_UserInitiatedCheckpoint] 1707 class MySplittableDoFn(beam.DoFn): 1708 def process( 1709 self, 1710 element, 1711 restriction_tracker=beam.DoFn.RestrictionParam( 1712 MyRestrictionProvider())): 1713 current_position = restriction_tracker.current_restriction.start() 1714 while True: 1715 # Pull records from an external service. 1716 try: 1717 records = external_service.fetch(current_position) 1718 if records.empty(): 1719 # Set a shorter delay in case we are being throttled. 
            restriction_tracker.defer_remainder(timestamp.Duration(seconds=10))
            return
          for record in records:
            if restriction_tracker.try_claim(record.position):
              current_position = record.position
              yield record
            else:
              return
        except TimeoutError:
          # Set a longer delay in case we are being throttled.
          restriction_tracker.defer_remainder(timestamp.Duration(seconds=60))
          return

  # [END SDF_UserInitiatedCheckpoint]


def sdf_get_size():
  # [START SDF_GetSize]
  # The RestrictionProvider is responsible for calculating the size of a given
  # restriction.
  class MyRestrictionProvider(beam.transforms.core.RestrictionProvider):
    def restriction_size(self, file_name, restriction):
      weight = 2 if "expensiveRecords" in file_name else 1
      return restriction.size() * weight

  # [END SDF_GetSize]


def sdf_bad_try_claim_loop():
  class FileToWordsRestrictionProvider(object):
    pass

  read_next_record = None

  # [START SDF_BadTryClaimLoop]
  class BadTryClaimLoop(beam.DoFn):
    def process(
        self,
        file_name,
        tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())):
      with open(file_name) as file_handle:
        file_handle.seek(tracker.current_restriction.start())
        # The restriction tracker can be modified by another thread in
        # parallel, so storing state locally is ill advised.
        end = tracker.current_restriction.end()
        while file_handle.tell() < end:
          # Only after successfully claiming should we produce any output
          # and/or perform side effects.
          tracker.try_claim(file_handle.tell())
          yield read_next_record(file_handle)

  # [END SDF_BadTryClaimLoop]


def sdf_custom_watermark_estimator():
  from apache_beam.io.iobase import WatermarkEstimator
  from apache_beam.transforms.core import WatermarkEstimatorProvider
  current_watermark = None

  class MyRestrictionProvider(object):
    pass

  # [START SDF_CustomWatermarkEstimator]
  # (Optional) Define a custom watermark state type to save information between
  # bundle processing rounds.
  class MyCustomWatermarkEstimatorState(object):
    def __init__(self, element, restriction):
      # Store data necessary for future watermark computations
      pass

  # Define a WatermarkEstimator
  class MyCustomWatermarkEstimator(WatermarkEstimator):
    def __init__(self, estimator_state):
      self.state = estimator_state

    def observe_timestamp(self, timestamp):
      # Will be invoked on each output from the SDF
      pass

    def current_watermark(self):
      # Return a monotonically increasing value
      return current_watermark

    def get_estimator_state(self):
      # Return state to resume future watermark estimation after a
      # checkpoint/split
      return self.state

  # Then, a WatermarkEstimatorProvider needs to be created for this
  # WatermarkEstimator
  class MyWatermarkEstimatorProvider(WatermarkEstimatorProvider):
    def initial_estimator_state(self, element, restriction):
      return MyCustomWatermarkEstimatorState(element, restriction)

    def create_watermark_estimator(self, estimator_state):
      return MyCustomWatermarkEstimator(estimator_state)

  # Finally, define the SDF using your estimator.
1818 class MySplittableDoFn(beam.DoFn): 1819 def process( 1820 self, 1821 element, 1822 restriction_tracker=beam.DoFn.RestrictionParam(MyRestrictionProvider()), 1823 watermark_estimator=beam.DoFn.WatermarkEstimatorParam( 1824 MyWatermarkEstimatorProvider())): 1825 # The current watermark can be inspected. 1826 watermark_estimator.current_watermark() 1827 1828 # [END SDF_CustomWatermarkEstimator] 1829 1830 1831 def sdf_truncate(): 1832 # [START SDF_Truncate] 1833 class MyRestrictionProvider(beam.transforms.core.RestrictionProvider): 1834 def truncate(self, file_name, restriction): 1835 if "optional" in file_name: 1836 # Skip optional files 1837 return None 1838 return restriction 1839 1840 # [END SDF_Truncate] 1841 1842 1843 def bundle_finalize(): 1844 my_callback_func = None 1845 1846 # [START BundleFinalize] 1847 class MySplittableDoFn(beam.DoFn): 1848 def process(self, element, bundle_finalizer=beam.DoFn.BundleFinalizerParam): 1849 # ... produce output ... 1850 1851 # Register callback function for this bundle that performs the side 1852 # effect. 1853 bundle_finalizer.register(my_callback_func) 1854 1855 # [END BundleFinalize]
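

# The key-value storage API assumed by SimpleKVSink above (connect, open_table,
# write_to_table, rename_table) is only described in comments; model_custom_sink
# expects the caller to supply a mock object implementing it. A minimal
# in-memory sketch of that API, for readers who want to run the sink example
# locally, could look like this (hypothetical helper, not one of the snippets
# included in the web docs):
class InMemorySimpleKV(object):
  """Toy in-memory stand-in for the simplekv service described above."""
  def __init__(self):
    self._tables = {}

  def connect(self, url):
    # Returns an access token; the in-memory version just derives it from the
    # URL.
    return 'token-for-%s' % url

  def open_table(self, access_token, table_name):
    # Creates (or reopens) a table named 'table_name' and returns it.
    return self._tables.setdefault(table_name, {})

  def write_to_table(self, access_token, table, key, value):
    # Writes a key-value pair to the given table.
    table[key] = value

  def rename_table(self, access_token, old_name, new_name):
    # Renames the table named 'old_name' to 'new_name'.
    self._tables[new_name] = self._tables.pop(old_name)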