github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/slice.go

github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/slice.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package bigslice
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"reflect"
    11  	"runtime"
    12  	"strings"
    13  	"sync"
    14  
    15  	"github.com/grailbio/base/errors"
    16  	"github.com/grailbio/base/log"
    17  	"github.com/grailbio/bigslice/frame"
    18  	"github.com/grailbio/bigslice/internal/defaultsize"
    19  	"github.com/grailbio/bigslice/slicefunc"
    20  	"github.com/grailbio/bigslice/sliceio"
    21  	"github.com/grailbio/bigslice/slicetype"
    22  	"github.com/grailbio/bigslice/typecheck"
    23  )
    24  
    25  var typeOfError = reflect.TypeOf((*error)(nil)).Elem()
    26  
    27  // DefaultChunkSize is the default size used for IO vectors throughout bigslice.
    28  var defaultChunksize = defaultsize.Chunk
    29  
    30  var errTypeError = errors.New("type error")
    31  
    32  // A Dep is a Slice dependency. Deps comprise a slice and a boolean flag
    33  // determining whether this is represents a shuffle dependency. Shuffle
    34  // dependencies must perform a data shuffle step: the dependency must partition
    35  // its output according to the Slice's partitioner, and, when the dependent
    36  // Slice is computed, the evaluator must pass in Readers that read a single
    37  // partition from all dependent shards. If Shuffle is true, then the provided
    38  // partitioner determines how the output is partitioned. If it is nil, the
    39  // default (hash by first column) partitioner is used.
    40  type Dep struct {
    41  	Slice
    42  	Shuffle     bool
    43  	Partitioner Partitioner
    44  	// Expand indicates that each shard of a shuffle dependency (i.e.,
    45  	// all the shards of a given partition) should be expanded (i.e.,
    46  	// not merged) when handed to the slice implementation. This is to
    47  	// support merge-sorting of shards of the same partition.
    48  	Expand bool
    49  }
    50  
    51  // ShardType indicates the type of sharding used by a Slice.
    52  type ShardType int
    53  
    54  const (
    55  	// HashShard Slices are partitioned by an (unspecified)
    56  	// hash of an record. That is, the same record should
    57  	// be assigned a stable shard number.
    58  	HashShard ShardType = iota
    59  	// RangeShard Slices are partitioned by the range of a key. The key
    60  	// is always the first column of the slice.
    61  	RangeShard
    62  )
    63  
    64  // A Partitioner is used to assign partitions to rows in a frame.
    65  type Partitioner func(ctx context.Context, frame frame.Frame, nshard int, shards []int)
    66  
    67  // A Slice is a shardable, ordered dataset. Each slice consists of zero or more
    68  // columns of data distributed over one or  more shards. Slices may declare
    69  // dependencies on other slices from which it is computed. In order to compute
    70  // a slice, its dependencies must first be computed, and their resulting
    71  // Readers are passed to a Slice's Reader method.
    72  //
    73  // Since Go does not support generic typing, Slice combinators perform their
    74  // own dynamic type checking. Schematically we write the n-ary slice with types
    75  // t1, t2, ..., tn as Slice<t1, t2, ..., tn>.
    76  //
    77  // Types that implement the Slice interface must be comparable.
    78  type Slice interface {
    79  	slicetype.Type
    80  
    81  	// Name returns a unique (composite) name for this Slice that also has
    82  	// useful context for diagnostic or status display.
    83  	Name() Name
    84  
    85  	// NumShard returns the number of shards in this Slice.
    86  	NumShard() int
    87  	// ShardType returns the sharding type of this Slice.
    88  	ShardType() ShardType
    89  
    90  	// NumDep returns the number of dependencies of this Slice.
    91  	NumDep() int
    92  	// Dep returns the i'th dependency for this Slice.
    93  	Dep(i int) Dep
    94  
    95  	// Combiner is an optional function that is used to combine multiple values
    96  	// with the same key from the slice's output. No combination is performed
    97  	// if Nil.
    98  	Combiner() slicefunc.Func
    99  
   100  	// Reader returns a Reader for a shard of this Slice. The reader itself
   101  	// computes the shard's values on demand. The caller must provide Readers
   102  	// for all of this shard's dependencies, constructed according to the
   103  	// dependency type (see Dep).
   104  	Reader(shard int, deps []sliceio.Reader) sliceio.Reader
   105  }
   106  
   107  // Pragma comprises runtime directives used during bigslice
   108  // execution.
   109  type Pragma interface {
   110  	// Procs returns the number of procs a slice task needs to run. It is
   111  	// superceded by Exclusive and clamped to the maximum number of procs per
   112  	// machine.
   113  	Procs() int
   114  	// Exclusive indicates that a slice task should be given
   115  	// exclusive access to the underlying machine.
   116  	Exclusive() bool
   117  	// Materialize indicates that the result of the slice task should be
   118  	// materialized, i.e. break pipelining.
   119  	Materialize() bool
   120  }
   121  
   122  // Pragmas composes multiple underlying Pragmas.
   123  type Pragmas []Pragma
   124  
   125  // Procs implements Pragma. If multiple tasks with Procs pragmas are pipelined,
   126  // we allocate the maximum to the composed pipeline.
   127  func (p Pragmas) Procs() int {
   128  	need := 1
   129  	for _, q := range p {
   130  		n := q.Procs()
   131  		if n > need {
   132  			need = n
   133  		}
   134  	}
   135  	return need
   136  }
   137  
   138  // Exclusive implements Pragma.
   139  func (p Pragmas) Exclusive() bool {
   140  	for _, q := range p {
   141  		if q.Exclusive() {
   142  			return true
   143  		}
   144  	}
   145  	return false
   146  }
   147  
   148  // Materialize implements Pragma.
   149  func (p Pragmas) Materialize() bool {
   150  	for _, q := range p {
   151  		if q.Materialize() {
   152  			return true
   153  		}
   154  	}
   155  	return false
   156  }
   157  
   158  type exclusive struct{}
   159  
   160  func (exclusive) Procs() int        { return 1 }
   161  func (exclusive) Exclusive() bool   { return true }
   162  func (exclusive) Materialize() bool { return false }
   163  
   164  // Exclusive is a Pragma that indicates the slice task should be given
   165  // exclusive access to the machine that runs it. Exclusive takes precedence
   166  // over Procs.
   167  var Exclusive Pragma = exclusive{}
   168  
   169  type materialize struct{}
   170  
   171  func (materialize) Procs() int        { return 1 }
   172  func (materialize) Exclusive() bool   { return false }
   173  func (materialize) Materialize() bool { return true }
   174  
   175  // ExperimentalMaterialize is a Pragma that indicates the slice task results
   176  // should be materialized, i.e. not pipelined. You may want to use this to
   177  // materialize and reuse results of tasks that would normally have been
   178  // pipelined.
   179  //
   180  // It is tagged "experimental" because we are considering other ways of
   181  // achieving this.
   182  //
   183  // TODO(jcharumilind): Consider doing this automatically for slices on which
   184  // multiple slices depend.
   185  var ExperimentalMaterialize Pragma = materialize{}
   186  
   187  type procs struct {
   188  	n int
   189  }
   190  
   191  func (p procs) Procs() int      { return p.n }
   192  func (procs) Exclusive() bool   { return false }
   193  func (procs) Materialize() bool { return false }
   194  
   195  // Procs returns a pragma that sets the number of procs a slice task needs to
   196  // run to n. It is superceded by Exclusive and clamped to the maximum number of
   197  // procs per machine.
   198  func Procs(n int) Pragma {
   199  	return procs{n: n}
   200  }
   201  
   202  type constSlice struct {
   203  	name Name
   204  	slicetype.Type
   205  	frame  frame.Frame
   206  	nshard int
   207  }
   208  
   209  // Const returns a Slice representing the provided value. Each column
   210  // of the Slice should be provided as a Go slice of the column's
   211  // type. The value is split into nshard shards.
   212  func Const(nshard int, columns ...interface{}) Slice {
   213  	if len(columns) == 0 {
   214  		typecheck.Panic(1, "const: must have at least one column")
   215  	}
   216  	s := new(constSlice)
   217  	s.name = MakeName("const")
   218  	s.nshard = nshard
   219  	if s.nshard < 1 {
   220  		typecheck.Panic(1, "const: shard must be >= 1")
   221  	}
   222  	var ok bool
   223  	s.Type, ok = typecheck.Slices(columns...)
   224  	if !ok {
   225  		typecheck.Panic(1, "const: invalid slice inputs")
   226  	}
   227  	// TODO(marius): convert panic to a typecheck panic
   228  	s.frame = frame.Slices(columns...)
   229  	return s
   230  }
   231  
   232  func (s *constSlice) Name() Name             { return s.name }
   233  func (*constSlice) Prefix() int              { return 1 }
   234  func (s *constSlice) NumShard() int          { return s.nshard }
   235  func (*constSlice) ShardType() ShardType     { return HashShard }
   236  func (*constSlice) NumDep() int              { return 0 }
   237  func (*constSlice) Dep(i int) Dep            { panic("no deps") }
   238  func (*constSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   239  
   240  type constReader struct {
   241  	op    *constSlice
   242  	frame frame.Frame
   243  	shard int
   244  }
   245  
   246  func (s *constReader) Read(ctx context.Context, out frame.Frame) (int, error) {
   247  	if !slicetype.Assignable(s.op, out) {
   248  		return 0, errTypeError
   249  	}
   250  	n := frame.Copy(out, s.frame)
   251  	m := s.frame.Len()
   252  	s.frame = s.frame.Slice(n, m)
   253  	if m == 0 {
   254  		return n, sliceio.EOF
   255  	}
   256  	return n, nil
   257  }
   258  
   259  // constShard computes the offset to and count of rows in the const data for a
   260  // given shard. n is the total number of rows in the data. nshard is the total
   261  // number of shards. constShard distributes data evenly. The difference in
   262  // count between one shard and another will be at most one.
   263  func constShard(n, nshard, shard int) (offset, count int) {
   264  	var (
   265  		quot = n / nshard
   266  		rem  = n % nshard
   267  	)
   268  	offset = quot * shard
   269  	count = quot
   270  	if shard < rem {
   271  		offset += shard
   272  		count++
   273  	} else {
   274  		offset += rem
   275  	}
   276  	return offset, count
   277  }
   278  
   279  func (s *constSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
   280  	offset, count := constShard(s.frame.Len(), s.nshard, shard)
   281  	if count == 0 {
   282  		return sliceio.EmptyReader{}
   283  	}
   284  	r := &constReader{
   285  		op:    s,
   286  		frame: s.frame.Slice(offset, offset+count),
   287  		shard: shard,
   288  	}
   289  	return r
   290  }
   291  
   292  type readerFuncSlice struct {
   293  	name Name
   294  	Pragma
   295  	slicetype.Type
   296  	nshard    int
   297  	read      slicefunc.Func
   298  	stateType reflect.Type
   299  }
   300  
   301  // ReaderFunc returns a Slice that uses the provided function to read
   302  // data. The function read must be of the form:
   303  //
   304  //	func(shard int, state stateType, col1 []col1Type, col2 []col2Type, ..., colN []colNType) (int, error)
   305  //
   306  // This returns a slice of the form:
   307  //
   308  //	Slice<col1Type, col2Type, ..., colNType>
   309  //
   310  // The function is invoked to fill a vector of elements. col1, ...,
   311  // colN are preallocated slices that should be filled by the reader
   312  // function. The function should return the number of elements that
   313  // were filled. The error EOF should be returned when no more data
   314  // are available.
   315  //
   316  // ReaderFunc provides the function with a zero-value state upon the
   317  // first invocation of the function for a given shard. (If the state
   318  // argument is a pointer, it is allocated.) Subsequent invocations of
   319  // the function receive the same state value, thus permitting the
   320  // reader to maintain local state across the read of a whole shard.
   321  func ReaderFunc(nshard int, read interface{}, prags ...Pragma) Slice {
   322  	s := new(readerFuncSlice)
   323  	s.name = MakeName("reader")
   324  	s.nshard = nshard
   325  	fn, ok := slicefunc.Of(read)
   326  	if !ok || fn.In.NumOut() < 3 || fn.In.Out(0).Kind() != reflect.Int {
   327  		typecheck.Panicf(1, "readerfunc: invalid reader function type %T", read)
   328  	}
   329  	if fn.Out.Out(0).Kind() != reflect.Int || fn.Out.Out(1) != typeOfError {
   330  		typecheck.Panicf(1, "readerfunc: function %T does not return (int, error)", read)
   331  	}
   332  	s.stateType = fn.In.Out(1)
   333  	arg := slicetype.Slice(fn.In, 2, fn.In.NumOut())
   334  	if s.Type, ok = typecheck.Devectorize(arg); !ok {
   335  		typecheck.Panicf(1, "readerfunc: function %T is not vectorized", read)
   336  	}
   337  	s.read = fn
   338  	s.Pragma = Pragmas(prags)
   339  	return s
   340  }
   341  
   342  func (r *readerFuncSlice) Name() Name             { return r.name }
   343  func (*readerFuncSlice) Prefix() int              { return 1 }
   344  func (r *readerFuncSlice) NumShard() int          { return r.nshard }
   345  func (*readerFuncSlice) ShardType() ShardType     { return HashShard }
   346  func (*readerFuncSlice) NumDep() int              { return 0 }
   347  func (*readerFuncSlice) Dep(i int) Dep            { panic("no deps") }
   348  func (*readerFuncSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   349  
   350  type readerFuncSliceReader struct {
   351  	op    *readerFuncSlice
   352  	state reflect.Value
   353  	shard int
   354  	err   error
   355  
   356  	// consecutiveEmptyCalls counts how many times op.read returned 0 elements consecutively.
   357  	// Many empty calls may mean the user forgot to return sliceio.EOF, so we log a warning.
   358  	consecutiveEmptyCalls int
   359  }
   360  
   361  func (r *readerFuncSliceReader) Read(ctx context.Context, out frame.Frame) (n int, err error) {
   362  	if r.err != nil {
   363  		return 0, r.err
   364  	}
   365  	if !slicetype.Assignable(out, r.op) {
   366  		return 0, errTypeError
   367  	}
   368  	// Initialize state (on first call)
   369  	if !r.state.IsValid() {
   370  		if r.op.stateType.Kind() == reflect.Ptr {
   371  			r.state = reflect.New(r.op.stateType.Elem())
   372  		} else {
   373  			r.state = reflect.Zero(r.op.stateType)
   374  		}
   375  	}
   376  	// out is passed to a user, zero it.
   377  	out.Zero()
   378  	rvs := r.op.read.Call(ctx, append([]reflect.Value{reflect.ValueOf(r.shard), r.state}, out.Values()...))
   379  	n = int(rvs[0].Int())
   380  	if n == 0 {
   381  		r.consecutiveEmptyCalls++
   382  		if r.consecutiveEmptyCalls > 7 && r.consecutiveEmptyCalls&(r.consecutiveEmptyCalls-1) == 0 {
   383  			log.Printf("warning: reader func returned empty vector %d consecutive times; "+
   384  				"don't forget sliceio.EOF", r.consecutiveEmptyCalls)
   385  		}
   386  	} else {
   387  		r.consecutiveEmptyCalls = 0
   388  	}
   389  	if e := rvs[1].Interface(); e != nil {
   390  		if err := e.(error); err == sliceio.EOF || errors.IsTemporary(err) {
   391  			r.err = err
   392  		} else {
   393  			// We consider all application-generated errors as Fatal unless marked otherwise.
   394  			r.err = errors.E(errors.Fatal, err)
   395  		}
   396  	}
   397  	return n, r.err
   398  }
   399  
   400  func (r *readerFuncSlice) Reader(shard int, reader []sliceio.Reader) sliceio.Reader {
   401  	return &readerFuncSliceReader{op: r, shard: shard}
   402  }
   403  
   404  type writerFuncSlice struct {
   405  	name Name
   406  	Slice
   407  	stateType reflect.Type
   408  	write     slicefunc.Func
   409  }
   410  
   411  // WriterFunc returns a Slice that is functionally equivalent to the input
   412  // Slice, allowing for computation with side effects by the provided write
   413  // function. The write function must be of the form:
   414  //
   415  //	func(shard int, state stateType, err error, col1 []col1Type, col2 []col2Type, ..., colN []colNType) error
   416  //
   417  // where the input slice is of the form:
   418  //
   419  //	Slice<col1Type, col2Type, ..., colNType>
   420  //
   421  // The write function is invoked with every read of the input Slice. Each
   422  // column slice will be of the same length and will be populated with the data
   423  // from the read. For performance, the passed column slices share memory with
   424  // the internal frame of the read. Do not modify the data in them, and assume
   425  // that they will be modified once write returns.
   426  //
   427  // The write function should return a non-nil error if there is a problem
   428  // writing, e.g. the write function encounters and error while writing to a
   429  // file. It should otherwise return nil.
   430  //
   431  // Any error from the read, including EOF, will be passed as err to the write
   432  // function. Note that err may be EOF when column lengths are >0, similar to
   433  // the semantics of sliceio.Reader.Read.
   434  //
   435  // If the write function performs I/O, it is recommended that the I/O be
   436  // buffered to allow downstream computations to progress.
   437  //
   438  // WriterFunc provides the function with a zero-value state upon the first
   439  // invocation of the function for a given shard. (If the state argument is a
   440  // pointer, it is allocated.) Subsequent invocations of the function receive
   441  // the same state value, thus permitting the writer to maintain local state
   442  // across the write of the whole shard.
   443  func WriterFunc(slice Slice, write interface{}) Slice {
   444  	s := new(writerFuncSlice)
   445  	s.name = MakeName("writer")
   446  	s.Slice = slice
   447  
   448  	// Our error messages for wrongly-typed write functions include a
   449  	// description of the expected type, which we construct here.
   450  	colTypElems := make([]string, slice.NumOut())
   451  	for i := range colTypElems {
   452  		colTypElems[i] = fmt.Sprintf("col%d %s", i+1, reflect.SliceOf(slice.Out(i)).String())
   453  	}
   454  	colTyps := strings.Join(colTypElems, ", ")
   455  	expectTyp := fmt.Sprintf("func(shard int, state stateType, err error, %s) error", colTyps)
   456  
   457  	die := func(msg string) {
   458  		typecheck.Panicf(2, "writerfunc: invalid writer function type %T; %s", write, msg)
   459  	}
   460  
   461  	fn, ok := slicefunc.Of(write)
   462  	if !ok ||
   463  		fn.In.NumOut() != 3+slice.NumOut() ||
   464  		fn.In.Out(0).Kind() != reflect.Int ||
   465  		fn.In.Out(2) != typeOfError {
   466  		die(fmt.Sprintf("must be %s", expectTyp))
   467  	}
   468  	s.stateType = fn.In.Out(1)
   469  	for i := 0; i < slice.NumOut(); i++ {
   470  		if reflect.SliceOf(slice.Out(i)) != fn.In.Out(i+3) {
   471  			die(fmt.Sprintf("must be %s", expectTyp))
   472  		}
   473  	}
   474  	if fn.Out.NumOut() != 1 || fn.Out.Out(0) != typeOfError {
   475  		die("must return error")
   476  	}
   477  	s.write = fn
   478  	return s
   479  }
   480  
   481  func (s *writerFuncSlice) Name() Name             { return s.name }
   482  func (*writerFuncSlice) NumDep() int              { return 1 }
   483  func (s *writerFuncSlice) Dep(i int) Dep          { return singleDep(i, s.Slice, false) }
   484  func (*writerFuncSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   485  
   486  type writerFuncReader struct {
   487  	shard     int
   488  	write     slicefunc.Func
   489  	reader    sliceio.Reader
   490  	stateType reflect.Type
   491  	state     reflect.Value
   492  	err       error
   493  }
   494  
   495  func (r *writerFuncReader) callWrite(ctx context.Context, err error, frame frame.Frame) error {
   496  	args := []reflect.Value{reflect.ValueOf(r.shard), r.state}
   497  
   498  	// TODO(jcharumilind): Cache error and column arguments, as they will
   499  	// likely be the same from call to call.
   500  	var errArg reflect.Value
   501  	if err == nil {
   502  		errArg = reflect.Zero(typeOfError)
   503  	} else {
   504  		errArg = reflect.ValueOf(err)
   505  	}
   506  	args = append(args, errArg)
   507  
   508  	args = append(args, frame.Values()...)
   509  	rvs := r.write.Call(ctx, args)
   510  	if e := rvs[0].Interface(); e != nil {
   511  		return e.(error)
   512  	}
   513  	return nil
   514  }
   515  
   516  func (r *writerFuncReader) Read(ctx context.Context, out frame.Frame) (int, error) {
   517  	if r.err != nil {
   518  		return 0, r.err
   519  	}
   520  	if !r.state.IsValid() {
   521  		if r.stateType.Kind() == reflect.Ptr {
   522  			r.state = reflect.New(r.stateType.Elem())
   523  		} else {
   524  			r.state = reflect.Zero(r.stateType)
   525  		}
   526  	}
   527  
   528  	n, err := r.reader.Read(ctx, out)
   529  	werr := r.callWrite(ctx, err, out.Slice(0, n))
   530  	if werr != nil && (err == nil || err == sliceio.EOF) {
   531  		if errors.IsTemporary(werr) {
   532  			err = werr
   533  		} else {
   534  			err = errors.E(errors.Fatal, werr)
   535  		}
   536  	}
   537  	r.err = err
   538  	return n, err
   539  }
   540  
   541  func (s *writerFuncSlice) Reader(shard int, reader []sliceio.Reader) sliceio.Reader {
   542  	return &writerFuncReader{
   543  		shard:     shard,
   544  		write:     s.write,
   545  		reader:    reader[0],
   546  		stateType: s.stateType,
   547  	}
   548  }
   549  
   550  type mapSlice struct {
   551  	name Name
   552  	Pragma
   553  	Slice
   554  	fval slicefunc.Func
   555  }
   556  
   557  // Map transforms a slice by invoking a function for each record. The
   558  // type of slice must match the arguments of the function fn. The
   559  // type of the returned slice is the set of columns returned by fn.
   560  // The returned slice matches the input slice's sharding, but is always
   561  // hash partitioned.
   562  //
   563  // Schematically:
   564  //
   565  //	Map(Slice<t1, t2, ..., tn>, func(v1 t1, v2 t2, ..., vn tn) (r1, r2, ..., rn)) Slice<r1, r2, ..., rn>
   566  func Map(slice Slice, fn interface{}, prags ...Pragma) Slice {
   567  	m := new(mapSlice)
   568  	m.name = MakeName("map")
   569  	m.Slice = slice
   570  	sliceFn, ok := slicefunc.Of(fn)
   571  	if !ok {
   572  		typecheck.Panicf(1, "map: invalid map function %T", fn)
   573  	}
   574  	if !typecheck.CanApply(sliceFn, slice) {
   575  		typecheck.Panicf(1, "map: function %T does not match input slice type %s", fn, slicetype.String(slice))
   576  	}
   577  	if sliceFn.Out.NumOut() == 0 {
   578  		typecheck.Panicf(1, "map: need at least one output column")
   579  	}
   580  	m.fval = sliceFn
   581  	m.Pragma = Pragmas(prags)
   582  	return m
   583  }
   584  
   585  func (m *mapSlice) Name() Name             { return m.name }
   586  func (m *mapSlice) NumOut() int            { return m.fval.Out.NumOut() }
   587  func (m *mapSlice) Out(c int) reflect.Type { return m.fval.Out.Out(c) }
   588  func (*mapSlice) ShardType() ShardType     { return HashShard }
   589  func (*mapSlice) NumDep() int              { return 1 }
   590  func (m *mapSlice) Dep(i int) Dep          { return singleDep(i, m.Slice, false) }
   591  func (*mapSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   592  
   593  type mapReader struct {
   594  	op     *mapSlice
   595  	reader sliceio.Reader // parent reader
   596  	in     frame.Frame    // buffer for input column vectors
   597  	err    error
   598  }
   599  
   600  func (m *mapReader) Read(ctx context.Context, out frame.Frame) (int, error) {
   601  	if m.err != nil {
   602  		return 0, m.err
   603  	}
   604  	if !slicetype.Assignable(out, m.op) {
   605  		return 0, errTypeError
   606  	}
   607  	n := out.Len()
   608  	if m.in.IsZero() {
   609  		m.in = frame.Make(m.op.Slice, n, n)
   610  	} else {
   611  		m.in = m.in.Ensure(n)
   612  	}
   613  	n, m.err = m.reader.Read(ctx, m.in.Slice(0, n))
   614  	// Now iterate over each record, transform it, and set the output
   615  	// records. Note that we could parallelize the map operation here,
   616  	// but for simplicity, parallelism should be achieved by finer
   617  	// sharding instead, simplifying management of parallel
   618  	// computation.
   619  	//
   620  	// TODO(marius): provide a vectorized version of map for efficiency.
   621  	args := make([]reflect.Value, m.in.NumOut())
   622  	for i := 0; i < n; i++ {
   623  		// Gather the arguments for a single invocation.
   624  		for j := range args {
   625  			args[j] = m.in.Index(j, i)
   626  		}
   627  		// TODO(marius): consider using an unsafe copy here
   628  		result := m.op.fval.Call(ctx, args)
   629  		for j := range result {
   630  			out.Index(j, i).Set(result[j])
   631  		}
   632  	}
   633  	return n, m.err
   634  }
   635  
   636  func (m *mapSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
   637  	return &mapReader{op: m, reader: deps[0]}
   638  }
   639  
   640  type filterSlice struct {
   641  	name Name
   642  	Pragma
   643  	Slice
   644  	pred slicefunc.Func
   645  }
   646  
   647  // Filter returns a slice where the provided predicate is applied to
   648  // each element in the given slice. The output slice contains only
   649  // those entries for which the predicate is true.
   650  //
   651  // The predicate function should receive each column of slice
   652  // and return a single boolean value.
   653  //
   654  // Schematically:
   655  //
   656  //	Filter(Slice<t1, t2, ..., tn>, func(t1, t2, ..., tn) bool) Slice<t1, t2, ..., tn>
   657  func Filter(slice Slice, pred interface{}, prags ...Pragma) Slice {
   658  	f := new(filterSlice)
   659  	f.name = MakeName("filter")
   660  	f.Slice = slice
   661  	f.Pragma = Pragmas(prags)
   662  	fn, ok := slicefunc.Of(pred)
   663  	if !ok {
   664  		typecheck.Panicf(1, "filter: invalid predicate function %T", pred)
   665  	}
   666  	if !typecheck.CanApply(fn, slice) {
   667  		typecheck.Panicf(1, "filter: function %T does not match input slice type %s", pred, slicetype.String(slice))
   668  	}
   669  	if fn.Out.NumOut() != 1 || fn.Out.Out(0).Kind() != reflect.Bool {
   670  		typecheck.Panic(1, "filter: predicate must return a single boolean value")
   671  	}
   672  	f.pred = fn
   673  	return f
   674  }
   675  
   676  func (f *filterSlice) Name() Name             { return f.name }
   677  func (*filterSlice) NumDep() int              { return 1 }
   678  func (f *filterSlice) Dep(i int) Dep          { return singleDep(i, f.Slice, false) }
   679  func (*filterSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   680  
   681  type filterReader struct {
   682  	op     *filterSlice
   683  	reader sliceio.Reader
   684  	in     frame.Frame
   685  	err    error
   686  }
   687  
   688  func (f *filterReader) Read(ctx context.Context, out frame.Frame) (n int, err error) {
   689  	if f.err != nil {
   690  		return 0, f.err
   691  	}
   692  	if !slicetype.Assignable(out, f.op) {
   693  		return 0, errTypeError
   694  	}
   695  	var (
   696  		m   int
   697  		max = out.Len()
   698  	)
   699  	args := make([]reflect.Value, out.NumOut())
   700  	for m < max && f.err == nil {
   701  		// TODO(marius): this can get pretty inefficient when the accept
   702  		// rate is low: as we fill the output; we could degenerate into a
   703  		// case where we issue a call for each element. Consider input
   704  		// buffering instead.
   705  		if f.in.IsZero() {
   706  			f.in = frame.Make(f.op, max-m, max-m)
   707  		} else {
   708  			f.in = f.in.Ensure(max - m)
   709  		}
   710  		n, f.err = f.reader.Read(ctx, f.in)
   711  		for i := 0; i < n; i++ {
   712  			for j := range args {
   713  				args[j] = f.in.Value(j).Index(i)
   714  			}
   715  			if f.op.pred.Call(ctx, args)[0].Bool() {
   716  				frame.Copy(out.Slice(m, m+1), f.in.Slice(i, i+1))
   717  				m++
   718  			}
   719  		}
   720  	}
   721  	return m, f.err
   722  }
   723  
   724  func (f *filterSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
   725  	return &filterReader{op: f, reader: deps[0]}
   726  }
   727  
   728  type flatmapSlice struct {
   729  	name Name
   730  	Pragma
   731  	Slice
   732  	fval slicefunc.Func
   733  	out  slicetype.Type
   734  }
   735  
   736  // Flatmap returns a Slice that applies the function fn to each
   737  // record in the slice, flattening the returned slice. That is, the
   738  // function fn should be of the form:
   739  //
   740  //	func(in1 inType1, in2 inType2, ...) (out1 []outType1, out2 []outType2)
   741  //
   742  // Schematically:
   743  //
   744  //	Flatmap(Slice<t1, t2, ..., tn>, func(v1 t1, v2 t2, ..., vn tn) ([]r1, []r2, ..., []rn)) Slice<r1, r2, ..., rn>
   745  func Flatmap(slice Slice, fn interface{}, prags ...Pragma) Slice {
   746  	f := new(flatmapSlice)
   747  	f.name = MakeName("flatmap")
   748  	f.Slice = slice
   749  	f.Pragma = Pragmas(prags)
   750  	sliceFn, ok := slicefunc.Of(fn)
   751  	if !ok {
   752  		typecheck.Panicf(1, "flatmap: invalid flatmap function %T", fn)
   753  	}
   754  	if !typecheck.CanApply(sliceFn, slice) {
   755  		typecheck.Panicf(1, "flatmap: flatmap function %T does not match input slice type %s", fn, slicetype.String(slice))
   756  	}
   757  	f.out, ok = typecheck.Devectorize(sliceFn.Out)
   758  	if !ok {
   759  		typecheck.Panicf(1, "flatmap: flatmap function %T is not vectorized", fn)
   760  	}
   761  	f.fval = sliceFn
   762  	return f
   763  }
   764  
   765  func (f *flatmapSlice) Name() Name             { return f.name }
   766  func (f *flatmapSlice) NumOut() int            { return f.out.NumOut() }
   767  func (f *flatmapSlice) Out(c int) reflect.Type { return f.out.Out(c) }
   768  func (*flatmapSlice) ShardType() ShardType     { return HashShard }
   769  func (*flatmapSlice) NumDep() int              { return 1 }
   770  func (f *flatmapSlice) Dep(i int) Dep          { return singleDep(i, f.Slice, false) }
   771  func (*flatmapSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   772  
   773  type flatmapReader struct {
   774  	op     *flatmapSlice
   775  	reader sliceio.Reader // underlying reader
   776  
   777  	in           frame.Frame // buffer of inputs
   778  	begIn, endIn int
   779  	out          frame.Frame // buffer of outputs
   780  	eof          bool
   781  }
   782  
// Read fills out with rows produced by applying the flatmap function to
// rows read from the underlying reader, and returns the number of rows
// written. Output that does not fit into out is retained in f.out and
// emitted first by the next call. Read returns errTypeError if out is not
// assignable from the slice's type, and sliceio.EOF once the underlying
// reader is exhausted and no buffered input or output remains.
func (f *flatmapReader) Read(ctx context.Context, out frame.Frame) (int, error) {
	if !slicetype.Assignable(out, f.op) {
		return 0, errTypeError
	}
	// args is reused for every invocation of the flatmap function; it holds
	// one value per input column.
	args := make([]reflect.Value, f.op.Slice.NumOut())
	// begOut is the next free row in out; endOut is its capacity.
	begOut, endOut := 0, out.Len()
	// Add buffered output from last call, if any.
	if f.out.Len() > 0 {
		n := frame.Copy(out, f.out)
		begOut += n
		f.out = f.out.Slice(n, f.out.Len())
	}
	// Continue as long as we have (possibly buffered) input, and space
	// for output.
	for begOut < endOut && (!f.eof || f.begIn < f.endIn) {
		// The input buffer is drained: refill it from the underlying reader.
		if f.begIn == f.endIn {
			// out[0].Len() may not be related to an actually useful size, but we'll go with it.
			// TODO(marius): maybe always default to a fixed chunk size? Or
			// dynamically keep track of the average input:output ratio?
			if f.in.IsZero() {
				f.in = frame.Make(f.op.Slice, out.Len(), out.Len())
			} else {
				f.in = f.in.Ensure(out.Len())
			}
			n, err := f.reader.Read(ctx, f.in)
			// EOF is not a failure here: the rows returned alongside it
			// still have to be processed.
			if err != nil && err != sliceio.EOF {
				return 0, err
			}
			f.begIn, f.endIn = 0, n
			f.eof = err == sliceio.EOF
		}
		// Consume one input at a time, as long as we have space in our
		// output buffer.
		for ; f.begIn < f.endIn && begOut < endOut; f.begIn++ {
			for j := range args {
				args[j] = f.in.Index(j, f.begIn)
			}
			result := frame.Values(f.op.fval.Call(ctx, args))
			n := frame.Copy(out.Slice(begOut, endOut), result)
			begOut += n
			// We've run out of output space. In this case, stash the rest of
			// our output into f.out, if any.
			if m := result.Len(); n < m {
				f.out = result.Slice(n, m)
			}
		}
	}
	var err error
	// We're EOF if we've encountered an EOF from the underlying
	// reader, there's no buffered output, and no buffered input.
	if f.eof && f.out.Len() == 0 && f.begIn == f.endIn {
		err = sliceio.EOF
	}
	return begOut, err
}
   838  
   839  func (f *flatmapSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
   840  	return &flatmapReader{op: f, reader: deps[0]}
   841  }
   842  
// foldSlice is the Slice implementation returned by Fold.
type foldSlice struct {
	// name identifies this slice; Fold sets it with MakeName("fold").
	name Name
	// Slice is the input slice being folded.
	Slice
	// fval is the user-supplied accumulator function.
	fval slicefunc.Func
	// out is the output type: the key column followed by the accumulator.
	out  slicetype.Type
	// dep is the shuffle dependency on the input slice.
	dep  Dep
}
   850  
   851  // Fold returns a slice that aggregates values by the first column
   852  // using a custom aggregation function. For an input slice
   853  // Slice<t1, t2, ..., tn>, Fold requires that the provided accumulator
   854  // function follow the form:
   855  //
   856  //	func(accum acctype, v2 t2, ..., vn tn) acctype
   857  //
   858  // The function is invoked once for each slice element with the same
   859  // value for column 1 (t1). On the first invocation, the accumulator
   860  // is passed the zero value of its accumulator type.
   861  //
   862  // Fold requires that the first column of the slice is partitionable.
   863  // See the documentation for Keyer for more details.
   864  //
   865  // Schematically:
   866  //
   867  //	Fold(Slice<t1, t2, ..., tn>, func(accum acctype, v2 t2, ..., vn tn) acctype) Slice<t1, acctype>
   868  //
   869  // BUG(marius): Fold does not yet support slice grouping
   870  func Fold(slice Slice, fold interface{}) Slice {
   871  	if n := slice.NumOut(); n < 2 {
   872  		typecheck.Panicf(1, "Fold can be applied only for slices with at least two columns; got %d", n)
   873  	}
   874  	if !frame.CanHash(slice.Out(0)) {
   875  		typecheck.Panicf(1, "fold: key type %s is not partitionable", slice.Out(0))
   876  	}
   877  	if !canMakeAccumulatorForKey(slice.Out(0)) {
   878  		typecheck.Panicf(1, "fold: key type %s cannot be accumulated", slice.Out(0))
   879  	}
   880  	f := new(foldSlice)
   881  	f.name = MakeName("fold")
   882  	f.Slice = slice
   883  	// Fold requires shuffle by the first column.
   884  	// TODO(marius): allow deps to express shuffling by other columns.
   885  	f.dep = Dep{slice, true, nil, false}
   886  
   887  	fn, ok := slicefunc.Of(fold)
   888  	if !ok {
   889  		typecheck.Panicf(1, "fold: invalid fold function %T", fold)
   890  	}
   891  	if fn.Out.NumOut() != 1 {
   892  		typecheck.Panicf(1, "fold: fold functions must return exactly one value")
   893  	}
   894  	// func(acc, t2, t3, ..., tn)
   895  	if got, want := fn.In, slicetype.Append(fn.Out, slicetype.Slice(slice, 1, slice.NumOut())); !typecheck.Equal(got, want) {
   896  		typecheck.Panicf(1, "fold: expected func(acc, t2, t3, ..., tn), got %T", fold)
   897  	}
   898  	f.fval = fn
   899  	// output: key, accumulator
   900  	f.out = slicetype.New(slice.Out(0), fn.Out.Out(0))
   901  	return f
   902  }
   903  
   904  func (f *foldSlice) Name() Name             { return f.name }
   905  func (f *foldSlice) NumOut() int            { return f.out.NumOut() }
   906  func (f *foldSlice) Out(c int) reflect.Type { return f.out.Out(c) }
   907  func (*foldSlice) NumDep() int              { return 1 }
   908  func (f *foldSlice) Dep(i int) Dep          { return f.dep }
   909  func (*foldSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   910  
// foldReader is the sliceio.Reader returned by foldSlice.Reader. It
// computes the accumulation on the first call to Read and serves results
// from it afterwards.
type foldReader struct {
	// op is the fold slice being computed.
	op     *foldSlice
	// reader supplies the input rows.
	reader sliceio.Reader
	// accum holds the accumulated results; it is nil until Read has
	// successfully run compute.
	accum  Accumulator
	// err is the last error produced by Read; once non-nil it is returned
	// by every subsequent call.
	err    error
}
   917  
   918  // Compute accumulates values across all keys in this shard. The entire
   919  // output is buffered in memory.
   920  func (f *foldReader) compute(ctx context.Context) (Accumulator, error) {
   921  	in := frame.Make(f.op.dep, defaultChunksize, defaultChunksize)
   922  	accum := makeAccumulator(f.op.dep.Out(0), f.op.out.Out(1), f.op.fval)
   923  	for {
   924  		n, err := f.reader.Read(ctx, in)
   925  		if err != nil && err != sliceio.EOF {
   926  			return nil, err
   927  		}
   928  		accum.Accumulate(in, n)
   929  		if err == sliceio.EOF {
   930  			return accum, nil
   931  		}
   932  	}
   933  }
   934  
   935  func (f *foldReader) Read(ctx context.Context, out frame.Frame) (int, error) {
   936  	if f.err != nil {
   937  		return 0, f.err
   938  	}
   939  	if !slicetype.Assignable(out, f.op) {
   940  		return 0, errTypeError
   941  	}
   942  	if f.accum == nil {
   943  		f.accum, f.err = f.compute(ctx)
   944  		if f.err != nil {
   945  			return 0, f.err
   946  		}
   947  	}
   948  	var n int
   949  	n, f.err = f.accum.Read(out.Value(0), out.Value(1))
   950  	return n, f.err
   951  }
   952  
   953  func (f *foldSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
   954  	return &foldReader{op: f, reader: deps[0]}
   955  }
   956  
// headSlice is the Slice implementation returned by Head.
type headSlice struct {
	// name identifies this slice.
	name Name
	// Slice is the underlying slice whose shards are truncated.
	Slice
	// n is the maximum number of rows returned per shard.
	n int
}
   962  
   963  // Head returns a slice that returns at most the first n items from
   964  // each shard of the underlying slice. Its type is the same as the
   965  // provided slice.
   966  func Head(slice Slice, n int) Slice {
   967  	return &headSlice{MakeName(fmt.Sprintf("head(%d)", n)), slice, n}
   968  }
   969  
   970  func (h *headSlice) Name() Name             { return h.name }
   971  func (*headSlice) NumDep() int              { return 1 }
   972  func (h *headSlice) Dep(i int) Dep          { return singleDep(i, h.Slice, false) }
   973  func (*headSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
   974  
// headReader is the sliceio.Reader returned by headSlice.Reader.
type headReader struct {
	// reader is the underlying reader.
	reader sliceio.Reader
	// n is the number of rows that may still be returned; Read decrements
	// it by the number of rows read, so it may become negative.
	n      int
}
   979  
   980  func (h headSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
   981  	return &headReader{deps[0], h.n}
   982  }
   983  
   984  func (h *headReader) Read(ctx context.Context, out frame.Frame) (n int, err error) {
   985  	if h.n <= 0 {
   986  		return 0, sliceio.EOF
   987  	}
   988  	n, err = h.reader.Read(ctx, out)
   989  	h.n -= n
   990  	if h.n < 0 {
   991  		n -= -h.n
   992  	}
   993  	return
   994  }
   995  
// scanSlice is the Slice implementation returned by Scan.
type scanSlice struct {
	// name identifies this slice.
	name Name
	// Slice is the input slice whose shards are scanned.
	Slice
	// scan is the user function invoked with the shard index and a scanner
	// over that shard.
	scan func(shard int, scanner *sliceio.Scanner) error
}
  1001  
  1002  // Scan invokes a function for each shard of the input Slice.
  1003  // It returns a unit Slice: Scan is inteded to be used for its side
  1004  // effects.
  1005  func Scan(slice Slice, scan func(shard int, scanner *sliceio.Scanner) error) Slice {
  1006  	return &scanSlice{MakeName("scan"), slice, scan}
  1007  }
  1008  
  1009  func (s *scanSlice) Name() Name             { return s.name }
  1010  func (*scanSlice) NumOut() int              { return 0 }
  1011  func (*scanSlice) Out(c int) reflect.Type   { panic(c) }
  1012  func (*scanSlice) NumDep() int              { return 1 }
  1013  func (s *scanSlice) Dep(i int) Dep          { return singleDep(i, s.Slice, false) }
  1014  func (*scanSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
  1015  
// scanReader is the sliceio.Reader returned by scanSlice.Reader.
type scanReader struct {
	// slice is a copy of the scan slice that created this reader.
	slice  scanSlice
	// shard is the index of the shard being scanned.
	shard  int
	// reader is the underlying reader for the shard.
	reader sliceio.Reader
}
  1021  
  1022  func (s *scanReader) Read(ctx context.Context, out frame.Frame) (n int, err error) {
  1023  	err = s.slice.scan(s.shard, sliceio.NewScanner(s.slice.Slice, sliceio.NopCloser(s.reader)))
  1024  	if err == nil {
  1025  		err = sliceio.EOF
  1026  	}
  1027  	return 0, err
  1028  }
  1029  
  1030  func (s scanSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
  1031  	return &scanReader{s, shard, deps[0]}
  1032  }
  1033  
// prefixSlice is the Slice implementation returned by Prefixed. It wraps a
// slice and overrides only its prefix.
type prefixSlice struct {
	// Pragma is the wrapped slice's Pragma when it implements one, and an
	// empty Pragmas value otherwise.
	Pragma
	// Slice is the wrapped slice.
	Slice
	// prefix is the number of leading columns that make up the key.
	prefix int
}
  1039  
  1040  // Prefixed returns a slice with the provided prefix. A prefix determines
  1041  // the number of columns (starting at 0) in the slice that compose the
  1042  // key values for that slice for operations like reduce. For example, prefix of 2
  1043  // means that columns 0 and 1 are the key.
  1044  func Prefixed(slice Slice, prefix int) Slice {
  1045  	if prefix < 1 {
  1046  		typecheck.Panic(1, "prefixed: prefix must include at least one column")
  1047  	}
  1048  	if prefix > slice.NumOut() {
  1049  		typecheck.Panicf(1, "prefixed: prefix %d is greater than number of columns %d", prefix, slice.NumOut())
  1050  	}
  1051  	var pragma Pragma = Pragmas{}
  1052  	if slicePragma, ok := slice.(Pragma); ok {
  1053  		pragma = slicePragma
  1054  	}
  1055  	return &prefixSlice{pragma, slice, prefix}
  1056  }
  1057  
  1058  func (p *prefixSlice) Prefix() int { return p.prefix }
  1059  
  1060  // Unwrap returns the underlying slice if the provided slice is used
  1061  // only to amend the type of the slice it composes.
  1062  //
  1063  // TODO(marius): this is required to properly compile slices that use the
  1064  // prefix combinator; we should have a more general and robust solution
  1065  // to this.
  1066  func Unwrap(slice Slice) Slice {
  1067  	if slice, ok := slice.(*prefixSlice); ok {
  1068  		return Unwrap(slice.Slice)
  1069  	}
  1070  	return slice
  1071  }
  1072  
  1073  // String returns a string describing the slice and its type.
  1074  func String(slice Slice) string {
  1075  	types := make([]string, slice.NumOut())
  1076  	for i := range types {
  1077  		types[i] = fmt.Sprint(slice.Out(i))
  1078  	}
  1079  	return fmt.Sprintf("%s<%s>", slice.Name().Op, strings.Join(types, ", "))
  1080  }
  1081  
  1082  func singleDep(i int, slice Slice, shuffle bool) Dep {
  1083  	if i != 0 {
  1084  		panic(fmt.Sprintf("invalid dependency %d", i))
  1085  	}
  1086  	return Dep{slice, shuffle, nil, false}
  1087  }
  1088  
  1089  var (
  1090  	helperMu sync.Mutex
  1091  	helpers  = make(map[string]bool)
  1092  )
  1093  
  1094  // Helper is used to mark a function as a helper function: names for
  1095  // newly created slices will be attributed to the caller of the
  1096  // function instead of the function itself.
  1097  func Helper() {
  1098  	helperMu.Lock()
  1099  	defer helperMu.Unlock()
  1100  	helpers[callerFunc(1)] = true
  1101  }
  1102  
  1103  func callerFunc(skip int) string {
  1104  	var pc [2]uintptr
  1105  	n := runtime.Callers(skip+2, pc[:]) // skip + runtime.Callers + callerFunc
  1106  	if n == 0 {
  1107  		panic("bigslice: zero callers found")
  1108  	}
  1109  	frames := runtime.CallersFrames(pc[:n])
  1110  	frame, _ := frames.Next()
  1111  	return frame.Function
  1112  }
  1113  
  1114  // Name is a unique name for a slice, constructed with useful context for
  1115  // diagnostic or status display.
  1116  type Name struct {
  1117  	// Op is the operation that the slice performs (e.g. "reduce", "map")
  1118  	Op string
  1119  	// File is the file in which the slice was defined.
  1120  	File string
  1121  	// Line is the line in File at which the slice was defined.
  1122  	Line int
  1123  	// Index disambiguates slices created on the same File and Line.
  1124  	Index int
  1125  }
  1126  
  1127  func (n Name) String() string {
  1128  	return fmt.Sprintf("%s@%s:%d", n.Op, n.File, n.Line)
  1129  }
  1130  
  1131  func MakeName(op string) Name {
  1132  	// Presume the correct frame is the caller of makeName,
  1133  	// but skip to the frame before the last helper, if any.
  1134  	var pc [50]uintptr             // consider at most 50 frames
  1135  	n := runtime.Callers(3, pc[:]) // caller of makeName, makeName, runtime.Callers.
  1136  	if n == 0 {
  1137  		panic("bigslice: no callers found")
  1138  	}
  1139  	frames := runtime.CallersFrames(pc[:n])
  1140  	helperMu.Lock()
  1141  	var found runtime.Frame
  1142  	for more := true; more; {
  1143  		var frame runtime.Frame
  1144  		frame, more = frames.Next()
  1145  		if found.PC == 0 {
  1146  			found = frame
  1147  		}
  1148  		if helpers[frame.Function] {
  1149  			found = runtime.Frame{}
  1150  		}
  1151  	}
  1152  	helperMu.Unlock()
  1153  	index := newNameIndex(op, found.File, found.Line)
  1154  	return Name{op, found.File, found.Line, index}
  1155  }
  1156  
  1157  type sliceNameIndexerKey struct {
  1158  	op   string
  1159  	file string
  1160  	line int
  1161  }
  1162  
  1163  var sliceNameIndexerMu sync.Mutex
  1164  var sliceNameIndexerMap = make(map[sliceNameIndexerKey]int)
  1165  
  1166  func newNameIndex(op, file string, line int) int {
  1167  	key := sliceNameIndexerKey{op, file, line}
  1168  	sliceNameIndexerMu.Lock()
  1169  	defer sliceNameIndexerMu.Unlock()
  1170  	c := sliceNameIndexerMap[key]
  1171  	sliceNameIndexerMap[key]++
  1172  	return c
  1173  }