kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/beamio/shards.go (about)

     1  /*
     2   * Copyright 2021 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package beamio
    18  
    19  import (
    20  	"bytes"
    21  	"reflect"
    22  	"sort"
    23  
    24  	"github.com/apache/beam/sdks/go/pkg/beam"
    25  	"github.com/apache/beam/sdks/go/pkg/beam/core/util/reflectx"
    26  	"github.com/apache/beam/sdks/go/pkg/beam/transforms/stats"
    27  )
    28  
    29  func init() {
    30  	beam.RegisterFunction(computeShard)
    31  	beam.RegisterFunction(assignKeyWeights)
    32  	beam.RegisterType(reflect.TypeOf((*computeMinKvSize)(nil)).Elem())
    33  	reflectx.RegisterFunc(reflect.ValueOf(bytesLessFn).Type(), func(_ any) reflectx.Func {
    34  		return newBytesLess()
    35  	})
    36  }
    37  
    38  // Not used, needed for the signature.
    39  func bytesLessFn(a, b []byte) bool {
    40  	return bytes.Compare(a, b) < 0
    41  }
    42  
    43  type bytesLess struct {
    44  	name string
    45  	t    reflect.Type
    46  }
    47  
    48  func newBytesLess() *bytesLess {
    49  	return &bytesLess{
    50  		name: reflectx.FunctionName(reflect.ValueOf(bytesLessFn).Interface()),
    51  		t:    reflect.ValueOf(bytesLessFn).Type(),
    52  	}
    53  }
    54  
    55  func (i *bytesLess) Name() string {
    56  	return i.name
    57  }
    58  func (i *bytesLess) Type() reflect.Type {
    59  	return i.t
    60  }
    61  func (i *bytesLess) Call(args []any) []any {
    62  	return []any{bytesLessFn(args[0].([]byte), args[1].([]byte))}
    63  }
    64  
    65  // Input is PCollection of beamio.KeyValue<[]byte, []byte>, output is PCollection of *ppb.KeyWeights.
    66  func computeAndAssignKeyWeights(s beam.Scope, kv beam.PCollection, shards int) beam.PCollection {
    67  	minKvSize := beam.Combine(s, computeMinKvSize{}, kv)
    68  	return beam.ParDo(s, assignKeyWeights, kv, beam.SideInput{Input: minKvSize})
    69  }
    70  
    71  // ComputeShards assigns shards to KeyValues. Input is a PCollection of beamio.KeyValue, output is (int, beamio.KeyValue).
    72  func ComputeShards(s beam.Scope, kv beam.PCollection, opts stats.Opts) beam.PCollection {
    73  	// Allows us to "weight" keys by how big their values are. This helps us estimate quantiles with similar sizes of data
    74  	weightedKeys := computeAndAssignKeyWeights(s, kv, opts.NumQuantiles)
    75  	ssp := stats.ApproximateWeightedQuantiles(s, weightedKeys, bytesLessFn, opts)
    76  	return beam.ParDo(s, computeShard, kv, beam.SideInput{Input: ssp})
    77  }
    78  
    79  type computeMinKvSize struct{}
    80  
    81  func (computeMinKvSize) AddInput(accum int, input KeyValue) int {
    82  	inputSize := len(input.Value) + len(input.Key)
    83  	if accum < inputSize && accum != 0 {
    84  		return accum
    85  	}
    86  	return inputSize
    87  }
    88  
    89  func (computeMinKvSize) MergeAccumulators(accum, other int) int {
    90  	if accum < other {
    91  		return accum
    92  	}
    93  	return other
    94  }
    95  
    96  func computeShard(kv KeyValue, ssp [][]byte) (int, KeyValue) {
    97  	shard := sort.Search(len(ssp), func(i int) bool { return bytes.Compare(kv.Key, ssp[i]) < 0 })
    98  	return shard, kv
    99  }
   100  
   101  func assignKeyWeights(kv KeyValue, minKvSize int) (int, []byte) {
   102  	kvSize := len(kv.Key) + len(kv.Value)
   103  	weight := kvSize / minKvSize
   104  	return weight, kv.Key
   105  }