github.com/apache/beam/sdks/v2@v2.48.2/go/examples/kafka/taxi.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements.  See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License.  You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// taxi is an example using a cross-language Kafka pipeline to write to and
// read from Kafka. This example reads from the PubSub NYC Taxi stream
// described in https://github.com/googlecodelabs/cloud-dataflow-nyc-taxi-tycoon,
// writes to a given Kafka topic, and then reads back from the same Kafka
// topic, logging every element. This is done as a streaming pipeline and will
// not end unless the pipeline is stopped externally.
//
// Running this example requires a Kafka cluster accessible to the runner, and
// a cross-language expansion service that can expand Kafka read and write
// transforms. An address for a persistent expansion service can be provided
// as a flag, or if none is specified then the SDK will attempt to
// automatically start an appropriate expansion service.
//
// # Setting Up a Kafka Cluster
//
// Setting up a Kafka cluster is more involved than can be covered in this
// example. All this example needs is a Kafka cluster accessible through a
// bootstrap server address that is passed in as a flag. Some instructions for
// setting up a single node Kafka cluster in GCE can be found here:
// https://github.com/GoogleCloudPlatform/java-docs-samples/tree/master/dataflow/flex-templates/kafka_to_bigquery
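//
// Once the cluster is up, the topic this example uses can be created ahead of
// time. As a sketch (assuming the standard Kafka CLI tools on the broker host
// and a broker version that accepts --bootstrap-server; the topic name below
// is this example's --topic default):
//
//	bin/kafka-topics.sh --create --topic kafka_taxirides_realtime \
//	  --bootstrap-server $BOOTSTRAP_SERVERS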
//
// # Running an Expansion Server
//
// If the automatic expansion service functionality is not available for your
// environment, or if you want improved performance, you will need to start a
// persistent expansion service. These instructions cover running the Java IO
// Expansion Service, and therefore require a JDK installation in a version
// supported by Beam. Depending on whether you are running this from a
// numbered Beam release or a development environment, there are two sources
// you may use for the Expansion service.
//
// Numbered release: The expansion service jar is vendored as the module
// org.apache.beam:beam-sdks-java-io-expansion-service in the Maven Central
// Repository. This jar can be executed directly with the following command:
//
//	java -jar <jar_name> <port_number>
//
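// For example, with the Beam 2.48.0 release (the exact jar name and the port
// here are illustrative), this might look like:
//
//	java -jar beam-sdks-java-io-expansion-service-2.48.0.jar 8097
//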
// Development env: This requires that the JAVA_HOME environment variable
// points to your JDK installation. From the root `beam/` directory of the
// Apache Beam repository, the jar can be built (or built and run) with the
// following commands:
//
//	Build: ./gradlew :sdks:java:io:expansion-service:build
//	Build and Run: ./gradlew :sdks:java:io:expansion-service:runExpansionService -PconstructionService.port=<port_num>
//
// # Running the Example on GCP
//
// Running this pipeline requires providing an address for the Expansion
// Service and for the Kafka cluster's bootstrap servers as flags, in addition
// to the usual flags for pipelines.
//
// An example command for executing this pipeline on GCP is as follows:
//
//	export PROJECT="$(gcloud config get-value project)"
//	export TEMP_LOCATION="gs://MY-BUCKET/temp"
//	export STAGING_LOCATION="gs://MY-BUCKET/staging"
//	export REGION="us-central1"
//	export JOB_NAME="kafka-taxi-`date +%Y%m%d-%H%M%S`"
//	export BOOTSTRAP_SERVERS="123.45.67.89:1234"
//	export EXPANSION_ADDR="localhost:1234"
//	cd ./sdks/go
//	go run ./examples/kafka/taxi.go \
//	  --runner=DataflowRunner \
//	  --temp_location=$TEMP_LOCATION \
//	  --staging_location=$STAGING_LOCATION \
//	  --project=$PROJECT \
//	  --region=$REGION \
//	  --job_name="${JOB_NAME}" \
//	  --bootstrap_servers=$BOOTSTRAP_SERVERS \
//	  --expansion_addr=$EXPANSION_ADDR
//
// # Running the Example From a Git Clone
//
// When running on a development environment, a custom container will likely
// need to be provided for the cross-language SDK. First this will require
// building and pushing the SDK container to a container repository, such as
// Docker Hub.
//
//	export DOCKER_ROOT="Your Docker Repository Root"
//	./gradlew :sdks:java:container:java8:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest
//	docker push $DOCKER_ROOT/beam_java8_sdk:latest
//
// For runners in local mode, simply building the container with the default
// values for docker-repository-root and docker-tag is enough to make it
// accessible locally.
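//
// As a sketch, that local build is the same Gradle task with its default
// properties left in place:
//
//	./gradlew :sdks:java:container:java8:docker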
//
// Additionally, you must provide the location of your custom container to the
// pipeline with the --sdk_harness_container_image_override flag for Java, or
// the --environment_config flag for Go. For example:
//
//	--sdk_harness_container_image_override=".*java.*,${DOCKER_ROOT}/beam_java8_sdk:latest" \
//	--environment_config=${DOCKER_ROOT}/beam_go_sdk:latest
package main

import (
	"context"
	"flag"
	"time"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/pubsubio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/xlang/kafkaio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
)

var (
	expansionAddr = flag.String("expansion_addr", "",
		"Address of Expansion Service. If not specified, attempts to automatically start an appropriate expansion service.")
	bootstrapServers = flag.String("bootstrap_servers", "",
		"(Required) URL of the bootstrap servers for the Kafka cluster. Should be accessible by the runner.")
	topic = flag.String("topic", "kafka_taxirides_realtime", "Kafka topic to write to and read from.")
)

func init() {
	// Register LogFn so the SDK harness can look it up and execute it
	// without runtime reflection.
	register.DoFn2x0[context.Context, []byte](&LogFn{})
}

// LogFn is a DoFn to log rides.
type LogFn struct{}

// ProcessElement logs each element it receives.
func (fn *LogFn) ProcessElement(ctx context.Context, elm []byte) {
	log.Infof(ctx, "Ride info: %v", string(elm))
}

// FinishBundle waits a bit so the job server finishes receiving logs.
func (fn *LogFn) FinishBundle() {
	time.Sleep(2 * time.Second)
}

func main() {
	flag.Parse()
	beam.Init()

	ctx := context.Background()

	p := beam.NewPipeline()
	s := p.Root()

	// Read from PubSub and write to Kafka. The Kafka write transform expects
	// key-value pairs, so assign an empty byte-slice key to each element
	// before windowing into fixed 15-second windows.
	data := pubsubio.Read(s, "pubsub-public-data", "taxirides-realtime", nil)
	kvData := beam.ParDo(s, func(elm []byte) ([]byte, []byte) { return []byte(""), elm }, data)
	windowed := beam.WindowInto(s, window.NewFixedWindows(15*time.Second), kvData)
	kafkaio.Write(s, *expansionAddr, *bootstrapServers, *topic, windowed)

	// Simultaneously read from Kafka, drop the keys, and log each value received.
	read := kafkaio.Read(s, *expansionAddr, *bootstrapServers, []string{*topic})
	vals := beam.DropKey(s, read)
	beam.ParDo0(s, &LogFn{}, vals)

	if err := beamx.Run(ctx, p); err != nil {
		log.Fatalf(ctx, "Failed to execute job: %v", err)
	}
}