github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/reduce.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package bigslice
     6  
     7  import (
     8  	"fmt"
     9  	"strings"
    10  
    11  	"github.com/grailbio/bigslice/frame"
    12  	"github.com/grailbio/bigslice/slicefunc"
    13  	"github.com/grailbio/bigslice/sliceio"
    14  	"github.com/grailbio/bigslice/slicetype"
    15  	"github.com/grailbio/bigslice/sortio"
    16  	"github.com/grailbio/bigslice/typecheck"
    17  )
    18  
    19  // Reduce returns a slice that reduces elements pairwise. Reduce
    20  // operations must be commutative and associative. Schematically:
    21  //
    22  //	Reduce(Slice<k, v>, func(v1, v2 v) v) Slice<k, v>
    23  //
    24  // The provided reducer function is invoked to aggregate values of
    25  // type v. Reduce can perform map-side "combining", so that data are
    26  // reduced to their aggregated value aggressively. This can often
    27  // speed up computations significantly.
    28  //
    29  // The slice to be reduced must have exactly 1 residual column: that is,
    30  // its prefix must leave just one column as the value column to be
    31  // aggregated.
    32  //
    33  // TODO(marius): Reduce currently maintains the working set of keys
    34  // in memory, and is thus appropriate only where the working set can
    35  // fit in memory. For situations where this is not the case, Cogroup
    36  // should be used instead (at an overhead). Reduce should spill to disk
    37  // when necessary.
    38  //
    39  // TODO(marius): consider pushing combiners into task dependency
    40  // definitions so that we can combine-read all partitions on one machine
    41  // simultaneously.
    42  func Reduce(slice Slice, reduce interface{}) Slice {
    43  	if res := slice.NumOut() - slice.Prefix(); res != 1 {
    44  		typecheck.Panicf(1, "the slice must only have one 1 residual column; has %d", res)
    45  	}
    46  	if err := canMakeCombiningFrame(slice); err != nil {
    47  		typecheck.Panic(1, err.Error())
    48  	}
    49  	fn, ok := slicefunc.Of(reduce)
    50  	if !ok {
    51  		typecheck.Panicf(1, "reduce: invalid reduce function %T", reduce)
    52  	}
    53  	outputType := slice.Out(slice.NumOut() - 1)
    54  	if fn.In.NumOut() != 2 || fn.In.Out(0) != outputType || fn.In.Out(1) != outputType ||
    55  		fn.Out.NumOut() != 1 || fn.Out.Out(0) != outputType {
    56  		typecheck.Panicf(1, "reduce: invalid reduce function %T, expected func(%s, %s) %s", reduce, outputType, outputType, outputType)
    57  	}
    58  	return &reduceSlice{slice, MakeName("reduce"), fn}
    59  }
    60  
// reduceSlice implements "post shuffle" combining merge sort. It is
// the Slice returned by Reduce.
type reduceSlice struct {
	// Slice is the slice being reduced; it is also this slice's sole
	// dependency (see Dep).
	Slice
	// name is the value returned by Name.
	name     Name
	// combiner is the reduce function; it is returned by Combiner and
	// passed to sortio.Reduce by Reader.
	combiner slicefunc.Func
}
    67  
    68  func (r *reduceSlice) Name() Name               { return r.name }
    69  func (*reduceSlice) NumDep() int                { return 1 }
    70  func (r *reduceSlice) Dep(i int) Dep            { return Dep{r.Slice, true, nil, true} }
    71  func (r *reduceSlice) Combiner() slicefunc.Func { return r.combiner }
    72  
    73  func (r *reduceSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
    74  	if len(deps) == 1 {
    75  		return deps[0]
    76  	}
    77  	return sortio.Reduce(r, fmt.Sprintf("app-%d", shard), deps, r.combiner)
    78  }
    79  
    80  // CanMakeCombiningFrame tells whether the provided Frame type can be
    81  // be made into a combining frame.
    82  // Returns an error if types cannot be combined.
    83  func canMakeCombiningFrame(typ slicetype.Type) error {
    84  	var failingTypes []string
    85  	for i := 0; i < typ.Prefix(); i++ {
    86  		if !frame.CanHash(typ.Out(i)) || !frame.CanCompare(typ.Out(i)) {
    87  			failingTypes = append(failingTypes, typ.Out(i).String())
    88  		}
    89  	}
    90  	if len(failingTypes) == 0 {
    91  		return nil
    92  	}
    93  	return fmt.Errorf("cannot combine values for keys of type: %s", strings.Join(failingTypes, ", "))
    94  }