kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/beamio/shards.go (about) 1 /* 2 * Copyright 2021 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package beamio 18 19 import ( 20 "bytes" 21 "reflect" 22 "sort" 23 24 "github.com/apache/beam/sdks/go/pkg/beam" 25 "github.com/apache/beam/sdks/go/pkg/beam/core/util/reflectx" 26 "github.com/apache/beam/sdks/go/pkg/beam/transforms/stats" 27 ) 28 29 func init() { 30 beam.RegisterFunction(computeShard) 31 beam.RegisterFunction(assignKeyWeights) 32 beam.RegisterType(reflect.TypeOf((*computeMinKvSize)(nil)).Elem()) 33 reflectx.RegisterFunc(reflect.ValueOf(bytesLessFn).Type(), func(_ any) reflectx.Func { 34 return newBytesLess() 35 }) 36 } 37 38 // Not used, needed for the signature. 39 func bytesLessFn(a, b []byte) bool { 40 return bytes.Compare(a, b) < 0 41 } 42 43 type bytesLess struct { 44 name string 45 t reflect.Type 46 } 47 48 func newBytesLess() *bytesLess { 49 return &bytesLess{ 50 name: reflectx.FunctionName(reflect.ValueOf(bytesLessFn).Interface()), 51 t: reflect.ValueOf(bytesLessFn).Type(), 52 } 53 } 54 55 func (i *bytesLess) Name() string { 56 return i.name 57 } 58 func (i *bytesLess) Type() reflect.Type { 59 return i.t 60 } 61 func (i *bytesLess) Call(args []any) []any { 62 return []any{bytesLessFn(args[0].([]byte), args[1].([]byte))} 63 } 64 65 // Input is PCollection of beamio.KeyValue<[]byte, []byte>, output is PCollection of *ppb.KeyWeights. 66 func computeAndAssignKeyWeights(s beam.Scope, kv beam.PCollection, shards int) beam.PCollection { 67 minKvSize := beam.Combine(s, computeMinKvSize{}, kv) 68 return beam.ParDo(s, assignKeyWeights, kv, beam.SideInput{Input: minKvSize}) 69 } 70 71 // ComputeShards assigns shards to KeyValues. Input is a PCollection of beamio.KeyValue, output is (int, beamio.KeyValue). 72 func ComputeShards(s beam.Scope, kv beam.PCollection, opts stats.Opts) beam.PCollection { 73 // Allows us to "weight" keys by how big their values are. This helps us estimate quantiles with similar sizes of data 74 weightedKeys := computeAndAssignKeyWeights(s, kv, opts.NumQuantiles) 75 ssp := stats.ApproximateWeightedQuantiles(s, weightedKeys, bytesLessFn, opts) 76 return beam.ParDo(s, computeShard, kv, beam.SideInput{Input: ssp}) 77 } 78 79 type computeMinKvSize struct{} 80 81 func (computeMinKvSize) AddInput(accum int, input KeyValue) int { 82 inputSize := len(input.Value) + len(input.Key) 83 if accum < inputSize && accum != 0 { 84 return accum 85 } 86 return inputSize 87 } 88 89 func (computeMinKvSize) MergeAccumulators(accum, other int) int { 90 if accum < other { 91 return accum 92 } 93 return other 94 } 95 96 func computeShard(kv KeyValue, ssp [][]byte) (int, KeyValue) { 97 shard := sort.Search(len(ssp), func(i int) bool { return bytes.Compare(kv.Key, ssp[i]) < 0 }) 98 return shard, kv 99 } 100 101 func assignKeyWeights(kv KeyValue, minKvSize int) (int, []byte) { 102 kvSize := len(kv.Key) + len(kv.Value) 103 weight := kvSize / minKvSize 104 return weight, kv.Key 105 }