github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/reduce.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package bigslice 6 7 import ( 8 "fmt" 9 "strings" 10 11 "github.com/grailbio/bigslice/frame" 12 "github.com/grailbio/bigslice/slicefunc" 13 "github.com/grailbio/bigslice/sliceio" 14 "github.com/grailbio/bigslice/slicetype" 15 "github.com/grailbio/bigslice/sortio" 16 "github.com/grailbio/bigslice/typecheck" 17 ) 18 19 // Reduce returns a slice that reduces elements pairwise. Reduce 20 // operations must be commutative and associative. Schematically: 21 // 22 // Reduce(Slice<k, v>, func(v1, v2 v) v) Slice<k, v> 23 // 24 // The provided reducer function is invoked to aggregate values of 25 // type v. Reduce can perform map-side "combining", so that data are 26 // reduced to their aggregated value aggressively. This can often 27 // speed up computations significantly. 28 // 29 // The slice to be reduced must have exactly 1 residual column: that is, 30 // its prefix must leave just one column as the value column to be 31 // aggregated. 32 // 33 // TODO(marius): Reduce currently maintains the working set of keys 34 // in memory, and is thus appropriate only where the working set can 35 // fit in memory. For situations where this is not the case, Cogroup 36 // should be used instead (at an overhead). Reduce should spill to disk 37 // when necessary. 38 // 39 // TODO(marius): consider pushing combiners into task dependency 40 // definitions so that we can combine-read all partitions on one machine 41 // simultaneously. 42 func Reduce(slice Slice, reduce interface{}) Slice { 43 if res := slice.NumOut() - slice.Prefix(); res != 1 { 44 typecheck.Panicf(1, "the slice must only have one 1 residual column; has %d", res) 45 } 46 if err := canMakeCombiningFrame(slice); err != nil { 47 typecheck.Panic(1, err.Error()) 48 } 49 fn, ok := slicefunc.Of(reduce) 50 if !ok { 51 typecheck.Panicf(1, "reduce: invalid reduce function %T", reduce) 52 } 53 outputType := slice.Out(slice.NumOut() - 1) 54 if fn.In.NumOut() != 2 || fn.In.Out(0) != outputType || fn.In.Out(1) != outputType || 55 fn.Out.NumOut() != 1 || fn.Out.Out(0) != outputType { 56 typecheck.Panicf(1, "reduce: invalid reduce function %T, expected func(%s, %s) %s", reduce, outputType, outputType, outputType) 57 } 58 return &reduceSlice{slice, MakeName("reduce"), fn} 59 } 60 61 // ReduceSlice implements "post shuffle" combining merge sort. 62 type reduceSlice struct { 63 Slice 64 name Name 65 combiner slicefunc.Func 66 } 67 68 func (r *reduceSlice) Name() Name { return r.name } 69 func (*reduceSlice) NumDep() int { return 1 } 70 func (r *reduceSlice) Dep(i int) Dep { return Dep{r.Slice, true, nil, true} } 71 func (r *reduceSlice) Combiner() slicefunc.Func { return r.combiner } 72 73 func (r *reduceSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { 74 if len(deps) == 1 { 75 return deps[0] 76 } 77 return sortio.Reduce(r, fmt.Sprintf("app-%d", shard), deps, r.combiner) 78 } 79 80 // CanMakeCombiningFrame tells whether the provided Frame type can be 81 // be made into a combining frame. 82 // Returns an error if types cannot be combined. 83 func canMakeCombiningFrame(typ slicetype.Type) error { 84 var failingTypes []string 85 for i := 0; i < typ.Prefix(); i++ { 86 if !frame.CanHash(typ.Out(i)) || !frame.CanCompare(typ.Out(i)) { 87 failingTypes = append(failingTypes, typ.Out(i).String()) 88 } 89 } 90 if len(failingTypes) == 0 { 91 return nil 92 } 93 return fmt.Errorf("cannot combine values for keys of type: %s", strings.Join(failingTypes, ", ")) 94 }