github.com/apache/beam/sdks/v2@v2.48.2/go/examples/kafka/taxi.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // taxi is an example using a cross-language Kafka pipeline to write and read 17 // to Kafka. This example reads from the PubSub NYC Taxi stream described in 18 // https://github.com/googlecodelabs/cloud-dataflow-nyc-taxi-tycoon, writes to 19 // a given Kafka topic and then reads back from the same Kafka topic, logging 20 // every element. This is done as a streaming pipeline and will not end 21 // unless the pipeline is stopped externally. 22 // 23 // Running this example requires a Kafka cluster accessible to the runner, and 24 // a cross-language expansion service that can expand Kafka read and write 25 // transforms. An address to a persistent expansion service can be provided as 26 // a flag, or if none is specified then the SDK will attempt to automatically 27 // start an appropriate expansion service. 28 // 29 // # Setting Up a Kafka Cluster 30 // 31 // Setting up a Kafka cluster is more involved than can be covered in this 32 // example. 
In order for this example to work, all that is necessary is a Kafka 33 // cluster accessible through a bootstrap server address that is passed in as 34 // a flag. Some instructions for setting up a single-node Kafka cluster in GCE 35 // can be found here: https://github.com/GoogleCloudPlatform/java-docs-samples/tree/master/dataflow/flex-templates/kafka_to_bigquery 36 // 37 // # Running an Expansion Server 38 // 39 // If the automatic expansion service functionality is not available for your 40 // environment, or if you want improved performance, you will need to start a 41 // persistent expansion service. These instructions will cover running the Java 42 // IO Expansion Service, and therefore require a JDK installation in a version 43 // supported by Beam. Depending on whether you are running this from a numbered 44 // Beam release, or a development environment, there are two sources you may 45 // use for the Expansion service. 46 // 47 // Numbered release: The expansion service jar is vendored as module 48 // org.apache.beam:beam-sdks-java-io-expansion-service in the Maven Repository. 49 // This jar can be executed directly with the following command: 50 // 51 // `java -jar <jar_name> <port_number>` 52 // 53 // Development env: This requires that the JAVA_HOME environment variable 54 // points to your JDK installation. From the root `beam/` directory of the 55 // Apache Beam repository, the jar can be built (or built and run) with the 56 // following commands: 57 // 58 // Build: ./gradlew :sdks:java:io:expansion-service:build 59 // Build and Run: ./gradlew :sdks:java:io:expansion-service:runExpansionService -PconstructionService.port=<port_num> 60 // 61 // # Running the Example on GCP 62 // 63 // Running this pipeline requires providing an address for the Expansion Service 64 // and for the Kafka cluster's bootstrap servers as flags, in addition to the 65 // usual flags for pipelines. 
66 // 67 // An example command for executing this pipeline on GCP is as follows: 68 // 69 // export PROJECT="$(gcloud config get-value project)" 70 // export TEMP_LOCATION="gs://MY-BUCKET/temp" // export STAGING_LOCATION="gs://MY-BUCKET/staging" 71 // export REGION="us-central1" 72 // export JOB_NAME="kafka-taxi-`date +%Y%m%d-%H%M%S`" 73 // export BOOTSTRAP_SERVERS="123.45.67.89:1234" 74 // export EXPANSION_ADDR="localhost:1234" 75 // cd ./sdks/go 76 // go run ./examples/kafka/taxi.go \ 77 // --runner=DataflowRunner \ 78 // --temp_location=$TEMP_LOCATION \ 79 // --staging_location=$STAGING_LOCATION \ 80 // --project=$PROJECT \ 81 // --region=$REGION \ 82 // --job_name="${JOB_NAME}" \ 83 // --bootstrap_servers=$BOOTSTRAP_SERVERS \ 84 // --expansion_addr=$EXPANSION_ADDR 85 // 86 // # Running the Example From a Git Clone 87 // 88 // When running on a development environment, a custom container will likely 89 // need to be provided for the cross-language SDK. First this will require 90 // building and pushing the SDK container to a container repository, such as 91 // Docker Hub. 92 // 93 // export DOCKER_ROOT="Your Docker Repository Root" 94 // ./gradlew :sdks:java:container:java8:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest 95 // docker push $DOCKER_ROOT/beam_java8_sdk:latest 96 // 97 // For runners in local mode, simply building the container using the default 98 // values for docker-repository-root and docker-tag will work to have it 99 // accessible locally. 100 // 101 // Additionally, you must provide the location of your custom container to the 102 // pipeline with the --sdk_harness_container_image_override flag for Java, or 103 // --environment_config flag for Go. 
For example: 104 // 105 // --sdk_harness_container_image_override=".*java.*,${DOCKER_ROOT}/beam_java8_sdk:latest" \ 106 // --environment_config=${DOCKER_ROOT}/beam_go_sdk:latest 107 package main 108 109 import ( 110 "context" 111 "flag" 112 "time" 113 114 "github.com/apache/beam/sdks/v2/go/pkg/beam" 115 "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" 116 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/pubsubio" 117 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/xlang/kafkaio" 118 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 119 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 120 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 121 ) 122 123 var ( 124 expansionAddr = flag.String("expansion_addr", "", 125 "Address of Expansion Service. If not specified, attempts to automatically start an appropriate expansion service.") 126 bootstrapServers = flag.String("bootstrap_servers", "", 127 "(Required) URL of the bootstrap servers for the Kafka cluster. Should be accessible by the runner.") 128 topic = flag.String("topic", "kafka_taxirides_realtime", "Kafka topic to write to and read from.") 129 ) 130 131 func init() { 132 register.DoFn2x0[context.Context, []byte](&LogFn{}) 133 } 134 135 // LogFn is a DoFn to log rides. 136 type LogFn struct{} 137 138 // ProcessElement logs each element it receives. 139 func (fn *LogFn) ProcessElement(ctx context.Context, elm []byte) { 140 log.Infof(ctx, "Ride info: %v", string(elm)) 141 } 142 143 // FinishBundle waits a bit so the job server finishes receiving logs. 144 func (fn *LogFn) FinishBundle() { 145 time.Sleep(2 * time.Second) 146 } 147 148 func main() { 149 flag.Parse() 150 beam.Init() 151 152 ctx := context.Background() 153 154 p := beam.NewPipeline() 155 s := p.Root() 156 157 // Read from Pubsub and write to Kafka. 
158 data := pubsubio.Read(s, "pubsub-public-data", "taxirides-realtime", nil) 159 kvData := beam.ParDo(s, func(elm []byte) ([]byte, []byte) { return []byte(""), elm }, data) 160 windowed := beam.WindowInto(s, window.NewFixedWindows(15*time.Second), kvData) 161 kafkaio.Write(s, *expansionAddr, *bootstrapServers, *topic, windowed) 162 163 // Simultaneously read from Kafka and log any element received. 164 read := kafkaio.Read(s, *expansionAddr, *bootstrapServers, []string{*topic}) 165 vals := beam.DropKey(s, read) 166 beam.ParDo0(s, &LogFn{}, vals) 167 168 if err := beamx.Run(ctx, p); err != nil { 169 log.Fatalf(ctx, "Failed to execute job: %v", err) 170 } 171 }