// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package snippets

import (
    "fmt"
    "math"
    "reflect"
    "sort"
    "strings"
    "time"

    "github.com/apache/beam/sdks/v2/go/pkg/beam"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/core/sdf"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/core/state"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/io/rtrackers/offsetrange"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
)

// [START model_pardo_pardo]

// ComputeWordLengthFn is the DoFn to perform on each element in the input PCollection.
type ComputeWordLengthFn struct{}

// ProcessElement is the method to execute for each element.
func (fn *ComputeWordLengthFn) ProcessElement(word string, emit func(int)) {
    emit(len(word))
}

// DoFns must be registered with beam.
func init() {
    beam.RegisterType(reflect.TypeOf((*ComputeWordLengthFn)(nil)))
    // 2 inputs and 0 outputs => DoFn2x0
    // 1 input => Emitter1
    // Input/output types are included in order in the brackets
    register.DoFn2x0[string, func(int)](&ComputeWordLengthFn{})
    register.Emitter1[int]()
}

// [END model_pardo_pardo]

// applyWordLen applies ComputeWordLengthFn to words, which must be
// a PCollection<string>.
func applyWordLen(s beam.Scope, words beam.PCollection) beam.PCollection {
    // [START model_pardo_apply]
    wordLengths := beam.ParDo(s, &ComputeWordLengthFn{}, words)
    // [END model_pardo_apply]
    return wordLengths
}

func applyWordLenAnon(s beam.Scope, words beam.PCollection) beam.PCollection {
    // [START model_pardo_apply_anon]
    // Apply an anonymous function as a DoFn to the PCollection words.
    // Save the result as the PCollection wordLengths.
    wordLengths := beam.ParDo(s, func(word string) int {
        return len(word)
    }, words)
    // [END model_pardo_apply_anon]
    return wordLengths
}

func applyGbk(s beam.Scope, input []stringPair) beam.PCollection {
    // [START groupbykey]
    // CreateAndSplit creates and returns a PCollection with <K,V>
    // from an input slice of stringPair (struct with K, V string fields).
    pairs := CreateAndSplit(s, input)
    keyed := beam.GroupByKey(s, pairs)
    // [END groupbykey]
    return keyed
}

// [START cogroupbykey_input_helpers]

type stringPair struct {
    K, V string
}

func splitStringPair(e stringPair) (string, string) {
    return e.K, e.V
}

func init() {
    // Register DoFn.
    register.Function1x2(splitStringPair)
}

// CreateAndSplit is a helper function that creates a PCollection of
// key-value pairs from an input slice of stringPair.
func CreateAndSplit(s beam.Scope, input []stringPair) beam.PCollection {
    initial := beam.CreateList(s, input)
    return beam.ParDo(s, splitStringPair, initial)
}

// [END cogroupbykey_input_helpers]
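
// The snippets above produce a grouped PCollection but never consume one. The
// following is a hypothetical sketch (not from the original snippets) of a
// DoFn that consumes the output of beam.GroupByKey: each element is a key
// paired with an iterator over all of that key's values in the window.
func formatGBKResults(key string, valueIter func(*string) bool) string {
    var v string
    var values []string
    for valueIter(&v) {
        values = append(values, v)
    }
    // Grouped values have no guaranteed order; sort for deterministic output.
    sort.Strings(values)
    return fmt.Sprintf("%s: %v", key, values)
}

func init() {
    register.Function2x1(formatGBKResults)
    register.Iter1[string]()
}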

type splittableDoFn struct{}

type weDoFn struct{}

// [START bundlefinalization_simplecallback]

func (fn *splittableDoFn) ProcessElement(bf beam.BundleFinalization, rt *sdf.LockRTracker, element string) {
    // ... produce output ...

    bf.RegisterCallback(5*time.Minute, func() error {
        // ... perform a side effect ...

        return nil
    })
}

// [END bundlefinalization_simplecallback]

// [START watermarkestimation_customestimator]

// WatermarkState is a custom type.
//
// It is optional to write your own state type when making a custom estimator.
type WatermarkState struct {
    Watermark time.Time
}

// CustomWatermarkEstimator is a custom watermark estimator.
// You may use any type here, including some of Beam's built-in watermark estimator types,
// e.g. sdf.WallTimeWatermarkEstimator, sdf.TimestampObservingWatermarkEstimator, and sdf.ManualWatermarkEstimator.
type CustomWatermarkEstimator struct {
    state WatermarkState
}

// CurrentWatermark returns the current watermark and is invoked on DoFn splits and self-checkpoints.
// Watermark estimators must implement CurrentWatermark() time.Time.
func (e *CustomWatermarkEstimator) CurrentWatermark() time.Time {
    return e.state.Watermark
}

// ObserveTimestamp is called on the output timestamps of all
// emitted elements to update the watermark. It is optional.
func (e *CustomWatermarkEstimator) ObserveTimestamp(ts time.Time) {
    e.state.Watermark = ts
}

// InitialWatermarkEstimatorState defines an initial state used to initialize the watermark
// estimator. It is optional. If this is not defined, WatermarkEstimatorState may not be
// defined and CreateWatermarkEstimator must not take in parameters.
func (fn *weDoFn) InitialWatermarkEstimatorState(et beam.EventTime, rest offsetrange.Restriction, element string) WatermarkState {
    // Return some watermark state.
    return WatermarkState{Watermark: time.Now()}
}

// CreateWatermarkEstimator creates the watermark estimator used by this Splittable DoFn.
// Must take in a state parameter if InitialWatermarkEstimatorState is defined, otherwise takes no parameters.
func (fn *weDoFn) CreateWatermarkEstimator(initialState WatermarkState) *CustomWatermarkEstimator {
    return &CustomWatermarkEstimator{state: initialState}
}

// WatermarkEstimatorState returns the state used to resume future watermark estimation
// after a checkpoint/split. It is required if InitialWatermarkEstimatorState is defined,
// otherwise it must not be defined.
func (fn *weDoFn) WatermarkEstimatorState(e *CustomWatermarkEstimator) WatermarkState {
    return e.state
}

// ProcessElement is the method to execute for each element.
// It can optionally take in a watermark estimator.
func (fn *weDoFn) ProcessElement(e *CustomWatermarkEstimator, element string) {
    // ...
    e.state.Watermark = time.Now()
}

// [END watermarkestimation_customestimator]
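
// The splittable DoFn methods above assume a ProcessElement that works through
// an offset-range restriction. This is a hypothetical sketch (not from the
// original snippets) of such a method on a separate type: work is performed
// only for positions successfully claimed from the restriction tracker. A
// complete SDF also needs CreateInitialRestriction, CreateTracker,
// RestrictionSize, and SplitRestriction methods, plus registration; those are
// omitted here.
type claimingDoFn struct{}

func (fn *claimingDoFn) ProcessElement(rt *sdf.LockRTracker, element string, emit func(int64)) {
    rest := rt.GetRestriction().(offsetrange.Restriction)
    for pos := rest.Start; rt.TryClaim(pos); pos++ {
        // ... do the work associated with the claimed position pos ...
        emit(pos)
    }
}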

// [START sdf_truncate]

// TruncateRestriction is a transform that is triggered when the pipeline starts to drain.
// It helps to finish a pipeline quicker by truncating the restriction.
func (fn *splittableDoFn) TruncateRestriction(rt *sdf.LockRTracker, element string) offsetrange.Restriction {
    start := rt.GetRestriction().(offsetrange.Restriction).Start
    prevEnd := rt.GetRestriction().(offsetrange.Restriction).End
    // Truncate the restriction by half.
    newEnd := prevEnd / 2
    return offsetrange.Restriction{
        Start: start,
        End:   newEnd,
    }
}

// [END sdf_truncate]

// [START cogroupbykey_output_helpers]

func formatCoGBKResults(key string, emailIter, phoneIter func(*string) bool) string {
    var s string
    var emails, phones []string
    for emailIter(&s) {
        emails = append(emails, s)
    }
    for phoneIter(&s) {
        phones = append(phones, s)
    }
    // Values have no guaranteed order, sort for deterministic output.
    sort.Strings(emails)
    sort.Strings(phones)
    return fmt.Sprintf("%s; %s; %s", key, formatStringIter(emails), formatStringIter(phones))
}

func init() {
    register.Function3x1(formatCoGBKResults)
    // 1 input of type string => Iter1[string]
    register.Iter1[string]()
}

// [END cogroupbykey_output_helpers]

func formatStringIter(vs []string) string {
    var b strings.Builder
    b.WriteRune('[')
    for i, v := range vs {
        b.WriteRune('\'')
        b.WriteString(v)
        b.WriteRune('\'')
        if i < len(vs)-1 {
            b.WriteString(", ")
        }
    }
    b.WriteRune(']')
    return b.String()
}

func coGBKExample(s beam.Scope) beam.PCollection {
    // [START cogroupbykey_inputs]
    var emailSlice = []stringPair{
        {"amy", "amy@example.com"},
        {"carl", "carl@example.com"},
        {"julia", "julia@example.com"},
        {"carl", "carl@email.com"},
    }

    var phoneSlice = []stringPair{
        {"amy", "111-222-3333"},
        {"james", "222-333-4444"},
        {"amy", "333-444-5555"},
        {"carl", "444-555-6666"},
    }
    emails := CreateAndSplit(s.Scope("CreateEmails"), emailSlice)
    phones := CreateAndSplit(s.Scope("CreatePhones"), phoneSlice)
    // [END cogroupbykey_inputs]

    // [START cogroupbykey_outputs]
    results := beam.CoGroupByKey(s, emails, phones)

    contactLines := beam.ParDo(s, formatCoGBKResults, results)
    // [END cogroupbykey_outputs]

    return contactLines
}

// [START combine_simple_sum]
func sumInts(a, v int) int {
    return a + v
}

func init() {
    register.Function2x1(sumInts)
}

func globallySumInts(s beam.Scope, ints beam.PCollection) beam.PCollection {
    return beam.Combine(s, sumInts, ints)
}

type boundedSum struct {
    Bound int
}

func (fn *boundedSum) MergeAccumulators(a, v int) int {
    sum := a + v
    if fn.Bound > 0 && sum > fn.Bound {
        return fn.Bound
    }
    return sum
}

func init() {
    register.Combiner1[int](&boundedSum{})
}

func globallyBoundedSumInts(s beam.Scope, bound int, ints beam.PCollection) beam.PCollection {
    return beam.Combine(s, &boundedSum{Bound: bound}, ints)
}

// [END combine_simple_sum]

// [START combine_custom_average]

type averageFn struct{}

type averageAccum struct {
    Count, Sum int
}

func (fn *averageFn) CreateAccumulator() averageAccum {
    return averageAccum{0, 0}
}

func (fn *averageFn) AddInput(a averageAccum, v int) averageAccum {
    return averageAccum{Count: a.Count + 1, Sum: a.Sum + v}
}

func (fn *averageFn) MergeAccumulators(a, v averageAccum) averageAccum {
    return averageAccum{Count: a.Count + v.Count, Sum: a.Sum + v.Sum}
}

func (fn *averageFn) ExtractOutput(a averageAccum) float64 {
    if a.Count == 0 {
        return math.NaN()
    }
    return float64(a.Sum) / float64(a.Count)
}

func init() {
    register.Combiner3[averageAccum, int, float64](&averageFn{})
}

// [END combine_custom_average]
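
// The same CombineFn can also be applied per key. This is a hypothetical
// sketch (not from the original snippets): beam.CombinePerKey applies
// averageFn to a PCollection of KV<K, int>, producing one average per key.
func averagePerKeyWithCombineFn(s beam.Scope, playerAccuracies beam.PCollection) beam.PCollection {
    return beam.CombinePerKey(s, &averageFn{}, playerAccuracies)
}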

func globallyAverage(s beam.Scope, ints beam.PCollection) beam.PCollection {
    // [START combine_global_average]
    average := beam.Combine(s, &averageFn{}, ints)
    // [END combine_global_average]
    return average
}

func globallyAverageWithDefault(s beam.Scope, ints beam.PCollection) beam.PCollection {
    // [START combine_global_with_default]
    // Setting combine defaults requires no helper function in the Go SDK.
    average := beam.Combine(s, &averageFn{}, ints)

    // To add a default value:
    defaultValue := beam.Create(s, float64(0))
    avgWithDefault := beam.ParDo(s, func(d float64, iter func(*float64) bool) float64 {
        var c float64
        if iter(&c) {
            // Side input has a value, so return it.
            return c
        }
        // Otherwise, return the default.
        return d
    }, defaultValue, beam.SideInput{Input: average})
    // [END combine_global_with_default]
    return avgWithDefault
}

func perKeyAverage(s beam.Scope, playerAccuracies beam.PCollection) beam.PCollection {
    // [START combine_per_key]
    avgAccuracyPerPlayer := stats.MeanPerKey(s, playerAccuracies)
    // [END combine_per_key]
    return avgAccuracyPerPlayer
}

func applyFlatten(s beam.Scope, pcol1, pcol2, pcol3 beam.PCollection) beam.PCollection {
    // [START model_multiple_pcollections_flatten]
    merged := beam.Flatten(s, pcol1, pcol2, pcol3)
    // [END model_multiple_pcollections_flatten]
    return merged
}

type Student struct {
    Percentile int
}

// [START model_multiple_pcollections_partition_fn]

func decileFn(student Student) int {
    return int(float64(student.Percentile) / float64(10))
}

func init() {
    register.Function1x1(decileFn)
}

// [END model_multiple_pcollections_partition_fn]

// applyPartition returns the students in the 40th percentile.
func applyPartition(s beam.Scope, students beam.PCollection) beam.PCollection {
    // [START model_multiple_pcollections_partition]
    // Partition returns a slice of PCollections.
    studentsByPercentile := beam.Partition(s, 10, decileFn, students)
    // Each partition can be extracted by indexing into the slice.
    fortiethPercentile := studentsByPercentile[4]
    // [END model_multiple_pcollections_partition]
    return fortiethPercentile
}

// [START model_pardo_side_input_dofn]

// filterWordsAbove is a DoFn that takes in a word and
// a singleton side input iterator of a length cutoff,
// and only emits words that are above that cutoff.
//
// If the iterator has no elements, an error is returned, aborting processing.
func filterWordsAbove(word string, lengthCutOffIter func(*float64) bool, emitAboveCutoff func(string)) error {
    var cutOff float64
    ok := lengthCutOffIter(&cutOff)
    if !ok {
        return fmt.Errorf("no length cutoff provided")
    }
    if float64(len(word)) > cutOff {
        emitAboveCutoff(word)
    }
    return nil
}

// filterWordsBelow is a DoFn that takes in a word and
// a singleton side input of a length cutoff,
// and only emits words that are at or beneath that cutoff.
//
// If the side input isn't a singleton, a runtime panic will occur.
func filterWordsBelow(word string, lengthCutOff float64, emitBelowCutoff func(string)) {
    if float64(len(word)) <= lengthCutOff {
        emitBelowCutoff(word)
    }
}

func init() {
    register.Function3x1(filterWordsAbove)
    register.Function3x0(filterWordsBelow)
    // 1 input of type string => Emitter1[string]
    register.Emitter1[string]()
    // 1 input of type float64 => Iter1[float64]
    register.Iter1[float64]()
}

// [END model_pardo_side_input_dofn]

// addSideInput demonstrates passing a side input to a DoFn.
func addSideInput(s beam.Scope, words beam.PCollection) (beam.PCollection, beam.PCollection) {
    wordLengths := applyWordLen(s, words)

    // [START model_pardo_side_input]
    // avgWordLength is a PCollection containing a single element, a singleton.
    avgWordLength := stats.Mean(s, wordLengths)

    // Side inputs are added with the beam.SideInput option to beam.ParDo.
    wordsAboveCutOff := beam.ParDo(s, filterWordsAbove, words, beam.SideInput{Input: avgWordLength})
    wordsBelowCutOff := beam.ParDo(s, filterWordsBelow, words, beam.SideInput{Input: avgWordLength})
    // [END model_pardo_side_input]
    return wordsAboveCutOff, wordsBelowCutOff
}

// isMarkedWord is a dummy function.
func isMarkedWord(word string) bool {
    return strings.HasPrefix(word, "MARKER")
}

// [START model_multiple_output_dofn]

// processWords is a DoFn that has 3 output PCollections. The emitter functions
// are matched in positional order to the PCollections returned by beam.ParDo3.
func processWords(word string, emitBelowCutoff, emitAboveCutoff, emitMarked func(string)) {
    const cutOff = 5
    if len(word) < cutOff {
        emitBelowCutoff(word)
    } else {
        emitAboveCutoff(word)
    }
    if isMarkedWord(word) {
        emitMarked(word)
    }
}

// processWordsMixed demonstrates mixing an emitter with a standard return.
// If a standard return is used, it will always be the first returned PCollection,
// followed in positional order by the emitter functions.
func processWordsMixed(word string, emitMarked func(string)) int {
    if isMarkedWord(word) {
        emitMarked(word)
    }
    return len(word)
}

func init() {
    register.Function4x0(processWords)
    register.Function2x1(processWordsMixed)
    // 1 input of type string => Emitter1[string]
    register.Emitter1[string]()
}

// [END model_multiple_output_dofn]

func applyMultipleOut(s beam.Scope, words beam.PCollection) (belows, aboves, markeds, lengths, mixedMarkeds beam.PCollection) {
    // [START model_multiple_output]
    // beam.ParDo3 returns PCollections in the same order as
    // the emit function parameters in processWords.
    below, above, marked := beam.ParDo3(s, processWords, words)

    // processWordsMixed uses both a standard return and an emitter function.
    // The standard return produces the first PCollection from beam.ParDo2,
    // and the emitter produces the second PCollection.
    length, mixedMarked := beam.ParDo2(s, processWordsMixed, words)
    // [END model_multiple_output]
    return below, above, marked, length, mixedMarked
}
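
// countMarkedWords is a hypothetical usage sketch (not from the original
// snippets) showing that each output of a multi-output ParDo is an ordinary
// PCollection: here the marked-word output is counted like any other.
func countMarkedWords(s beam.Scope, words beam.PCollection) beam.PCollection {
    _, _, marked, _, _ := applyMultipleOut(s, words)
    return stats.Count(s, marked)
}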

// [START model_paneinfo]

func extractWordsFn(pn beam.PaneInfo, line string, emitWords func(string)) {
    if pn.Timing == typex.PaneEarly || pn.Timing == typex.PaneOnTime {
        // ... perform operation ...
    }
    if pn.Timing == typex.PaneLate {
        // ... perform operation ...
    }
    if pn.IsFirst {
        // ... perform operation ...
    }
    if pn.IsLast {
        // ... perform operation ...
    }

    words := strings.Split(line, " ")
    for _, w := range words {
        emitWords(w)
    }
}

// [END model_paneinfo]

func contains(s []string, e string) bool {
    for _, a := range s {
        if a == e {
            return true
        }
    }
    return false
}

// TODO(https://github.com/apache/beam/issues/22737): Update state_and_timers to a good example to demonstrate both state and timers.
// Rename this to bag_state and update the bag state example in the programming guide at that point.
// [START state_and_timers]

// bagStateFn only emits words that haven't been seen.
type bagStateFn struct {
    bag state.Bag[string]
}

func (s *bagStateFn) ProcessElement(p state.Provider, book string, word string, emitWords func(string)) error {
    // Get all values we've written to this bag state in this window.
    vals, ok, err := s.bag.Read(p)
    if err != nil {
        return err
    }
    if !ok || !contains(vals, word) {
        emitWords(word)
        s.bag.Add(p, word)
    }

    if len(vals) > 10000 {
        // Example of clearing and starting again with an empty bag.
        s.bag.Clear(p)
    }

    return nil
}

// [END state_and_timers]

// [START value_state]

// valueStateFn keeps track of the number of elements seen.
type valueStateFn struct {
    val state.Value[int]
}

func (s *valueStateFn) ProcessElement(p state.Provider, book string, word string, emitWords func(string)) error {
    // Get the value stored in our state.
    val, ok, err := s.val.Read(p)
    if err != nil {
        return err
    }
    if !ok {
        s.val.Write(p, 1)
    } else {
        s.val.Write(p, val+1)
    }

    if val > 10000 {
        // Example of clearing and starting again with an empty value state.
        s.val.Clear(p)
    }

    return nil
}

// [END value_state]

type MyCustomType struct{}

func (m MyCustomType) Bytes() []byte {
    return nil
}

func (m MyCustomType) FromBytes(_ []byte) MyCustomType {
    return m
}

// [START value_state_coder]

type valueStateDoFn struct {
    val state.Value[MyCustomType]
}

func encode(m MyCustomType) []byte {
    return m.Bytes()
}

func decode(b []byte) MyCustomType {
    return MyCustomType{}.FromBytes(b)
}

func init() {
    beam.RegisterCoder(reflect.TypeOf((*MyCustomType)(nil)).Elem(), encode, decode)
}

// [END value_state_coder]
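
// applyBagState is a hypothetical sketch (not from the original snippets) of
// how a stateful DoFn such as bagStateFn might be applied: state fields are
// initialized when constructing the DoFn, and stateful DoFns require keyed
// (KV) input, here assumed to be KV<string, string>. DoFn registration is
// omitted for brevity.
func applyBagState(s beam.Scope, keyedWords beam.PCollection) beam.PCollection {
    return beam.ParDo(s, &bagStateFn{
        bag: state.MakeBagState[string]("seenWords"),
    }, keyedWords)
}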

type combineFn struct{}

// [START combining_state]

// combiningStateFn keeps track of the number of elements seen.
type combiningStateFn struct {
    // types are the types of the accumulator, input, and output respectively
    val state.Combining[int, int, int]
}

func (s *combiningStateFn) ProcessElement(p state.Provider, book string, word string, emitWords func(string)) error {
    // Get the value stored in our state.
    val, _, err := s.val.Read(p)
    if err != nil {
        return err
    }
    s.val.Add(p, 1)

    if val > 10000 {
        // Example of clearing and starting again with an empty accumulator.
        s.val.Clear(p)
    }

    return nil
}

func main() {
    // ...
    // The CombineFn param can be a simple fn like this or a structural CombineFn.
    cFn := state.MakeCombiningState[int, int, int]("stateKey", func(a, b int) int {
        return a + b
    })
    // ...

    // [END combining_state]

    fmt.Print(cFn)
}

type statefulDoFn struct {
    s state.Value[int]
}

func statefulPipeline() beam.PCollection {
    var s beam.Scope
    var elements beam.PCollection

    // [START windowed_state]

    items := beam.ParDo(s, statefulDoFn{}, elements)
    out := beam.WindowInto(s, window.NewFixedWindows(24*time.Hour), items)

    // [END windowed_state]

    return out
}

func init() {
    register.Function3x0(extractWordsFn)
    // 1 input of type string => Emitter1[string]
    register.Emitter1[string]()
}

// [START countwords_composite]
// CountWords is a function that builds a composite PTransform
// to count the number of times each word appears.
func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection {
    // A subscope is required for a function to become a composite transform.
    // We assign it to the original scope variable s to shadow the original
    // for the rest of the CountWords function.
    s = s.Scope("CountWords")

    // Since the same subscope is used for the following transforms,
    // they are in the same composite PTransform.

    // Convert lines of text into individual words.
    words := beam.ParDo(s, extractWordsFn, lines)

    // Count the number of times each word occurs.
    wordCounts := stats.Count(s, words)

    // Return any PCollections that should be available after
    // the composite transform.
    return wordCounts
}

// [END countwords_composite]
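
// runCountWords is a hypothetical sketch (not from the original snippets)
// showing how the CountWords composite might be wired into a pipeline with an
// in-memory input. In a real program, the constructed pipeline p would be
// handed to a runner, e.g. beamx.Run(ctx, p).
func runCountWords() beam.PCollection {
    p, s := beam.NewPipelineWithRoot()
    lines := beam.Create(s, "the quick brown fox", "jumps over the lazy dog")
    counts := CountWords(s, lines)
    _ = p // Pass p to a runner to execute the pipeline.
    return counts
}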