github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/slice.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package bigslice 6 7 import ( 8 "context" 9 "fmt" 10 "reflect" 11 "runtime" 12 "strings" 13 "sync" 14 15 "github.com/grailbio/base/errors" 16 "github.com/grailbio/base/log" 17 "github.com/grailbio/bigslice/frame" 18 "github.com/grailbio/bigslice/internal/defaultsize" 19 "github.com/grailbio/bigslice/slicefunc" 20 "github.com/grailbio/bigslice/sliceio" 21 "github.com/grailbio/bigslice/slicetype" 22 "github.com/grailbio/bigslice/typecheck" 23 ) 24 25 var typeOfError = reflect.TypeOf((*error)(nil)).Elem() 26 27 // DefaultChunkSize is the default size used for IO vectors throughout bigslice. 28 var defaultChunksize = defaultsize.Chunk 29 30 var errTypeError = errors.New("type error") 31 32 // A Dep is a Slice dependency. Deps comprise a slice and a boolean flag 33 // determining whether this is represents a shuffle dependency. Shuffle 34 // dependencies must perform a data shuffle step: the dependency must partition 35 // its output according to the Slice's partitioner, and, when the dependent 36 // Slice is computed, the evaluator must pass in Readers that read a single 37 // partition from all dependent shards. If Shuffle is true, then the provided 38 // partitioner determines how the output is partitioned. If it is nil, the 39 // default (hash by first column) partitioner is used. 40 type Dep struct { 41 Slice 42 Shuffle bool 43 Partitioner Partitioner 44 // Expand indicates that each shard of a shuffle dependency (i.e., 45 // all the shards of a given partition) should be expanded (i.e., 46 // not merged) when handed to the slice implementation. This is to 47 // support merge-sorting of shards of the same partition. 48 Expand bool 49 } 50 51 // ShardType indicates the type of sharding used by a Slice. 
52 type ShardType int 53 54 const ( 55 // HashShard Slices are partitioned by an (unspecified) 56 // hash of an record. That is, the same record should 57 // be assigned a stable shard number. 58 HashShard ShardType = iota 59 // RangeShard Slices are partitioned by the range of a key. The key 60 // is always the first column of the slice. 61 RangeShard 62 ) 63 64 // A Partitioner is used to assign partitions to rows in a frame. 65 type Partitioner func(ctx context.Context, frame frame.Frame, nshard int, shards []int) 66 67 // A Slice is a shardable, ordered dataset. Each slice consists of zero or more 68 // columns of data distributed over one or more shards. Slices may declare 69 // dependencies on other slices from which it is computed. In order to compute 70 // a slice, its dependencies must first be computed, and their resulting 71 // Readers are passed to a Slice's Reader method. 72 // 73 // Since Go does not support generic typing, Slice combinators perform their 74 // own dynamic type checking. Schematically we write the n-ary slice with types 75 // t1, t2, ..., tn as Slice<t1, t2, ..., tn>. 76 // 77 // Types that implement the Slice interface must be comparable. 78 type Slice interface { 79 slicetype.Type 80 81 // Name returns a unique (composite) name for this Slice that also has 82 // useful context for diagnostic or status display. 83 Name() Name 84 85 // NumShard returns the number of shards in this Slice. 86 NumShard() int 87 // ShardType returns the sharding type of this Slice. 88 ShardType() ShardType 89 90 // NumDep returns the number of dependencies of this Slice. 91 NumDep() int 92 // Dep returns the i'th dependency for this Slice. 93 Dep(i int) Dep 94 95 // Combiner is an optional function that is used to combine multiple values 96 // with the same key from the slice's output. No combination is performed 97 // if Nil. 98 Combiner() slicefunc.Func 99 100 // Reader returns a Reader for a shard of this Slice. 
The reader itself 101 // computes the shard's values on demand. The caller must provide Readers 102 // for all of this shard's dependencies, constructed according to the 103 // dependency type (see Dep). 104 Reader(shard int, deps []sliceio.Reader) sliceio.Reader 105 } 106 107 // Pragma comprises runtime directives used during bigslice 108 // execution. 109 type Pragma interface { 110 // Procs returns the number of procs a slice task needs to run. It is 111 // superceded by Exclusive and clamped to the maximum number of procs per 112 // machine. 113 Procs() int 114 // Exclusive indicates that a slice task should be given 115 // exclusive access to the underlying machine. 116 Exclusive() bool 117 // Materialize indicates that the result of the slice task should be 118 // materialized, i.e. break pipelining. 119 Materialize() bool 120 } 121 122 // Pragmas composes multiple underlying Pragmas. 123 type Pragmas []Pragma 124 125 // Procs implements Pragma. If multiple tasks with Procs pragmas are pipelined, 126 // we allocate the maximum to the composed pipeline. 127 func (p Pragmas) Procs() int { 128 need := 1 129 for _, q := range p { 130 n := q.Procs() 131 if n > need { 132 need = n 133 } 134 } 135 return need 136 } 137 138 // Exclusive implements Pragma. 139 func (p Pragmas) Exclusive() bool { 140 for _, q := range p { 141 if q.Exclusive() { 142 return true 143 } 144 } 145 return false 146 } 147 148 // Materialize implements Pragma. 149 func (p Pragmas) Materialize() bool { 150 for _, q := range p { 151 if q.Materialize() { 152 return true 153 } 154 } 155 return false 156 } 157 158 type exclusive struct{} 159 160 func (exclusive) Procs() int { return 1 } 161 func (exclusive) Exclusive() bool { return true } 162 func (exclusive) Materialize() bool { return false } 163 164 // Exclusive is a Pragma that indicates the slice task should be given 165 // exclusive access to the machine that runs it. Exclusive takes precedence 166 // over Procs. 
167 var Exclusive Pragma = exclusive{} 168 169 type materialize struct{} 170 171 func (materialize) Procs() int { return 1 } 172 func (materialize) Exclusive() bool { return false } 173 func (materialize) Materialize() bool { return true } 174 175 // ExperimentalMaterialize is a Pragma that indicates the slice task results 176 // should be materialized, i.e. not pipelined. You may want to use this to 177 // materialize and reuse results of tasks that would normally have been 178 // pipelined. 179 // 180 // It is tagged "experimental" because we are considering other ways of 181 // achieving this. 182 // 183 // TODO(jcharumilind): Consider doing this automatically for slices on which 184 // multiple slices depend. 185 var ExperimentalMaterialize Pragma = materialize{} 186 187 type procs struct { 188 n int 189 } 190 191 func (p procs) Procs() int { return p.n } 192 func (procs) Exclusive() bool { return false } 193 func (procs) Materialize() bool { return false } 194 195 // Procs returns a pragma that sets the number of procs a slice task needs to 196 // run to n. It is superceded by Exclusive and clamped to the maximum number of 197 // procs per machine. 198 func Procs(n int) Pragma { 199 return procs{n: n} 200 } 201 202 type constSlice struct { 203 name Name 204 slicetype.Type 205 frame frame.Frame 206 nshard int 207 } 208 209 // Const returns a Slice representing the provided value. Each column 210 // of the Slice should be provided as a Go slice of the column's 211 // type. The value is split into nshard shards. 212 func Const(nshard int, columns ...interface{}) Slice { 213 if len(columns) == 0 { 214 typecheck.Panic(1, "const: must have at least one column") 215 } 216 s := new(constSlice) 217 s.name = MakeName("const") 218 s.nshard = nshard 219 if s.nshard < 1 { 220 typecheck.Panic(1, "const: shard must be >= 1") 221 } 222 var ok bool 223 s.Type, ok = typecheck.Slices(columns...) 
224 if !ok { 225 typecheck.Panic(1, "const: invalid slice inputs") 226 } 227 // TODO(marius): convert panic to a typecheck panic 228 s.frame = frame.Slices(columns...) 229 return s 230 } 231 232 func (s *constSlice) Name() Name { return s.name } 233 func (*constSlice) Prefix() int { return 1 } 234 func (s *constSlice) NumShard() int { return s.nshard } 235 func (*constSlice) ShardType() ShardType { return HashShard } 236 func (*constSlice) NumDep() int { return 0 } 237 func (*constSlice) Dep(i int) Dep { panic("no deps") } 238 func (*constSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 239 240 type constReader struct { 241 op *constSlice 242 frame frame.Frame 243 shard int 244 } 245 246 func (s *constReader) Read(ctx context.Context, out frame.Frame) (int, error) { 247 if !slicetype.Assignable(s.op, out) { 248 return 0, errTypeError 249 } 250 n := frame.Copy(out, s.frame) 251 m := s.frame.Len() 252 s.frame = s.frame.Slice(n, m) 253 if m == 0 { 254 return n, sliceio.EOF 255 } 256 return n, nil 257 } 258 259 // constShard computes the offset to and count of rows in the const data for a 260 // given shard. n is the total number of rows in the data. nshard is the total 261 // number of shards. constShard distributes data evenly. The difference in 262 // count between one shard and another will be at most one. 
263 func constShard(n, nshard, shard int) (offset, count int) { 264 var ( 265 quot = n / nshard 266 rem = n % nshard 267 ) 268 offset = quot * shard 269 count = quot 270 if shard < rem { 271 offset += shard 272 count++ 273 } else { 274 offset += rem 275 } 276 return offset, count 277 } 278 279 func (s *constSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { 280 offset, count := constShard(s.frame.Len(), s.nshard, shard) 281 if count == 0 { 282 return sliceio.EmptyReader{} 283 } 284 r := &constReader{ 285 op: s, 286 frame: s.frame.Slice(offset, offset+count), 287 shard: shard, 288 } 289 return r 290 } 291 292 type readerFuncSlice struct { 293 name Name 294 Pragma 295 slicetype.Type 296 nshard int 297 read slicefunc.Func 298 stateType reflect.Type 299 } 300 301 // ReaderFunc returns a Slice that uses the provided function to read 302 // data. The function read must be of the form: 303 // 304 // func(shard int, state stateType, col1 []col1Type, col2 []col2Type, ..., colN []colNType) (int, error) 305 // 306 // This returns a slice of the form: 307 // 308 // Slice<col1Type, col2Type, ..., colNType> 309 // 310 // The function is invoked to fill a vector of elements. col1, ..., 311 // colN are preallocated slices that should be filled by the reader 312 // function. The function should return the number of elements that 313 // were filled. The error EOF should be returned when no more data 314 // are available. 315 // 316 // ReaderFunc provides the function with a zero-value state upon the 317 // first invocation of the function for a given shard. (If the state 318 // argument is a pointer, it is allocated.) Subsequent invocations of 319 // the function receive the same state value, thus permitting the 320 // reader to maintain local state across the read of a whole shard. 
321 func ReaderFunc(nshard int, read interface{}, prags ...Pragma) Slice { 322 s := new(readerFuncSlice) 323 s.name = MakeName("reader") 324 s.nshard = nshard 325 fn, ok := slicefunc.Of(read) 326 if !ok || fn.In.NumOut() < 3 || fn.In.Out(0).Kind() != reflect.Int { 327 typecheck.Panicf(1, "readerfunc: invalid reader function type %T", read) 328 } 329 if fn.Out.Out(0).Kind() != reflect.Int || fn.Out.Out(1) != typeOfError { 330 typecheck.Panicf(1, "readerfunc: function %T does not return (int, error)", read) 331 } 332 s.stateType = fn.In.Out(1) 333 arg := slicetype.Slice(fn.In, 2, fn.In.NumOut()) 334 if s.Type, ok = typecheck.Devectorize(arg); !ok { 335 typecheck.Panicf(1, "readerfunc: function %T is not vectorized", read) 336 } 337 s.read = fn 338 s.Pragma = Pragmas(prags) 339 return s 340 } 341 342 func (r *readerFuncSlice) Name() Name { return r.name } 343 func (*readerFuncSlice) Prefix() int { return 1 } 344 func (r *readerFuncSlice) NumShard() int { return r.nshard } 345 func (*readerFuncSlice) ShardType() ShardType { return HashShard } 346 func (*readerFuncSlice) NumDep() int { return 0 } 347 func (*readerFuncSlice) Dep(i int) Dep { panic("no deps") } 348 func (*readerFuncSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 349 350 type readerFuncSliceReader struct { 351 op *readerFuncSlice 352 state reflect.Value 353 shard int 354 err error 355 356 // consecutiveEmptyCalls counts how many times op.read returned 0 elements consecutively. 357 // Many empty calls may mean the user forgot to return sliceio.EOF, so we log a warning. 
358 consecutiveEmptyCalls int 359 } 360 361 func (r *readerFuncSliceReader) Read(ctx context.Context, out frame.Frame) (n int, err error) { 362 if r.err != nil { 363 return 0, r.err 364 } 365 if !slicetype.Assignable(out, r.op) { 366 return 0, errTypeError 367 } 368 // Initialize state (on first call) 369 if !r.state.IsValid() { 370 if r.op.stateType.Kind() == reflect.Ptr { 371 r.state = reflect.New(r.op.stateType.Elem()) 372 } else { 373 r.state = reflect.Zero(r.op.stateType) 374 } 375 } 376 // out is passed to a user, zero it. 377 out.Zero() 378 rvs := r.op.read.Call(ctx, append([]reflect.Value{reflect.ValueOf(r.shard), r.state}, out.Values()...)) 379 n = int(rvs[0].Int()) 380 if n == 0 { 381 r.consecutiveEmptyCalls++ 382 if r.consecutiveEmptyCalls > 7 && r.consecutiveEmptyCalls&(r.consecutiveEmptyCalls-1) == 0 { 383 log.Printf("warning: reader func returned empty vector %d consecutive times; "+ 384 "don't forget sliceio.EOF", r.consecutiveEmptyCalls) 385 } 386 } else { 387 r.consecutiveEmptyCalls = 0 388 } 389 if e := rvs[1].Interface(); e != nil { 390 if err := e.(error); err == sliceio.EOF || errors.IsTemporary(err) { 391 r.err = err 392 } else { 393 // We consider all application-generated errors as Fatal unless marked otherwise. 394 r.err = errors.E(errors.Fatal, err) 395 } 396 } 397 return n, r.err 398 } 399 400 func (r *readerFuncSlice) Reader(shard int, reader []sliceio.Reader) sliceio.Reader { 401 return &readerFuncSliceReader{op: r, shard: shard} 402 } 403 404 type writerFuncSlice struct { 405 name Name 406 Slice 407 stateType reflect.Type 408 write slicefunc.Func 409 } 410 411 // WriterFunc returns a Slice that is functionally equivalent to the input 412 // Slice, allowing for computation with side effects by the provided write 413 // function. 
The write function must be of the form: 414 // 415 // func(shard int, state stateType, err error, col1 []col1Type, col2 []col2Type, ..., colN []colNType) error 416 // 417 // where the input slice is of the form: 418 // 419 // Slice<col1Type, col2Type, ..., colNType> 420 // 421 // The write function is invoked with every read of the input Slice. Each 422 // column slice will be of the same length and will be populated with the data 423 // from the read. For performance, the passed column slices share memory with 424 // the internal frame of the read. Do not modify the data in them, and assume 425 // that they will be modified once write returns. 426 // 427 // The write function should return a non-nil error if there is a problem 428 // writing, e.g. the write function encounters and error while writing to a 429 // file. It should otherwise return nil. 430 // 431 // Any error from the read, including EOF, will be passed as err to the write 432 // function. Note that err may be EOF when column lengths are >0, similar to 433 // the semantics of sliceio.Reader.Read. 434 // 435 // If the write function performs I/O, it is recommended that the I/O be 436 // buffered to allow downstream computations to progress. 437 // 438 // WriterFunc provides the function with a zero-value state upon the first 439 // invocation of the function for a given shard. (If the state argument is a 440 // pointer, it is allocated.) Subsequent invocations of the function receive 441 // the same state value, thus permitting the writer to maintain local state 442 // across the write of the whole shard. 443 func WriterFunc(slice Slice, write interface{}) Slice { 444 s := new(writerFuncSlice) 445 s.name = MakeName("writer") 446 s.Slice = slice 447 448 // Our error messages for wrongly-typed write functions include a 449 // description of the expected type, which we construct here. 
450 colTypElems := make([]string, slice.NumOut()) 451 for i := range colTypElems { 452 colTypElems[i] = fmt.Sprintf("col%d %s", i+1, reflect.SliceOf(slice.Out(i)).String()) 453 } 454 colTyps := strings.Join(colTypElems, ", ") 455 expectTyp := fmt.Sprintf("func(shard int, state stateType, err error, %s) error", colTyps) 456 457 die := func(msg string) { 458 typecheck.Panicf(2, "writerfunc: invalid writer function type %T; %s", write, msg) 459 } 460 461 fn, ok := slicefunc.Of(write) 462 if !ok || 463 fn.In.NumOut() != 3+slice.NumOut() || 464 fn.In.Out(0).Kind() != reflect.Int || 465 fn.In.Out(2) != typeOfError { 466 die(fmt.Sprintf("must be %s", expectTyp)) 467 } 468 s.stateType = fn.In.Out(1) 469 for i := 0; i < slice.NumOut(); i++ { 470 if reflect.SliceOf(slice.Out(i)) != fn.In.Out(i+3) { 471 die(fmt.Sprintf("must be %s", expectTyp)) 472 } 473 } 474 if fn.Out.NumOut() != 1 || fn.Out.Out(0) != typeOfError { 475 die("must return error") 476 } 477 s.write = fn 478 return s 479 } 480 481 func (s *writerFuncSlice) Name() Name { return s.name } 482 func (*writerFuncSlice) NumDep() int { return 1 } 483 func (s *writerFuncSlice) Dep(i int) Dep { return singleDep(i, s.Slice, false) } 484 func (*writerFuncSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 485 486 type writerFuncReader struct { 487 shard int 488 write slicefunc.Func 489 reader sliceio.Reader 490 stateType reflect.Type 491 state reflect.Value 492 err error 493 } 494 495 func (r *writerFuncReader) callWrite(ctx context.Context, err error, frame frame.Frame) error { 496 args := []reflect.Value{reflect.ValueOf(r.shard), r.state} 497 498 // TODO(jcharumilind): Cache error and column arguments, as they will 499 // likely be the same from call to call. 500 var errArg reflect.Value 501 if err == nil { 502 errArg = reflect.Zero(typeOfError) 503 } else { 504 errArg = reflect.ValueOf(err) 505 } 506 args = append(args, errArg) 507 508 args = append(args, frame.Values()...) 
509 rvs := r.write.Call(ctx, args) 510 if e := rvs[0].Interface(); e != nil { 511 return e.(error) 512 } 513 return nil 514 } 515 516 func (r *writerFuncReader) Read(ctx context.Context, out frame.Frame) (int, error) { 517 if r.err != nil { 518 return 0, r.err 519 } 520 if !r.state.IsValid() { 521 if r.stateType.Kind() == reflect.Ptr { 522 r.state = reflect.New(r.stateType.Elem()) 523 } else { 524 r.state = reflect.Zero(r.stateType) 525 } 526 } 527 528 n, err := r.reader.Read(ctx, out) 529 werr := r.callWrite(ctx, err, out.Slice(0, n)) 530 if werr != nil && (err == nil || err == sliceio.EOF) { 531 if errors.IsTemporary(werr) { 532 err = werr 533 } else { 534 err = errors.E(errors.Fatal, werr) 535 } 536 } 537 r.err = err 538 return n, err 539 } 540 541 func (s *writerFuncSlice) Reader(shard int, reader []sliceio.Reader) sliceio.Reader { 542 return &writerFuncReader{ 543 shard: shard, 544 write: s.write, 545 reader: reader[0], 546 stateType: s.stateType, 547 } 548 } 549 550 type mapSlice struct { 551 name Name 552 Pragma 553 Slice 554 fval slicefunc.Func 555 } 556 557 // Map transforms a slice by invoking a function for each record. The 558 // type of slice must match the arguments of the function fn. The 559 // type of the returned slice is the set of columns returned by fn. 560 // The returned slice matches the input slice's sharding, but is always 561 // hash partitioned. 
//
// Schematically:
//
//	Map(Slice<t1, t2, ..., tn>, func(v1 t1, v2 t2, ..., vn tn) (r1, r2, ..., rn)) Slice<r1, r2, ..., rn>
//
// Map raises a typecheck panic if fn is not a valid slice function, if it
// cannot be applied to the columns of slice, or if it returns no values.
func Map(slice Slice, fn interface{}, prags ...Pragma) Slice {
	m := new(mapSlice)
	m.name = MakeName("map")
	m.Slice = slice
	sliceFn, ok := slicefunc.Of(fn)
	if !ok {
		typecheck.Panicf(1, "map: invalid map function %T", fn)
	}
	if !typecheck.CanApply(sliceFn, slice) {
		typecheck.Panicf(1, "map: function %T does not match input slice type %s", fn, slicetype.String(slice))
	}
	if sliceFn.Out.NumOut() == 0 {
		typecheck.Panicf(1, "map: need at least one output column")
	}
	m.fval = sliceFn
	m.Pragma = Pragmas(prags)
	return m
}

// The column type of a mapSlice is the result type of its map function, so
// NumOut and Out override the methods promoted from the embedded input Slice.
func (m *mapSlice) Name() Name             { return m.name }
func (m *mapSlice) NumOut() int            { return m.fval.Out.NumOut() }
func (m *mapSlice) Out(c int) reflect.Type { return m.fval.Out.Out(c) }
func (*mapSlice) ShardType() ShardType     { return HashShard }
func (*mapSlice) NumDep() int              { return 1 }
func (m *mapSlice) Dep(i int) Dep          { return singleDep(i, m.Slice, false) }
func (*mapSlice) Combiner() slicefunc.Func { return slicefunc.Nil }

// mapReader reads a shard of a mapSlice by reading records from the parent
// reader and applying the map function to each of them.
type mapReader struct {
	op     *mapSlice
	reader sliceio.Reader // parent reader
	in     frame.Frame    // buffer for input column vectors
	err    error          // sticky: once set, every later Read returns it
}

// Read reads up to out.Len() records from the parent reader, applies the map
// function to each one and stores the results in the corresponding row of
// out. It returns the number of records read from the parent together with
// the parent's error, which is remembered for subsequent calls.
func (m *mapReader) Read(ctx context.Context, out frame.Frame) (int, error) {
	if m.err != nil {
		return 0, m.err
	}
	if !slicetype.Assignable(out, m.op) {
		return 0, errTypeError
	}
	n := out.Len()
	// Allocate the input buffer on first use; afterwards reuse it, growing it
	// as needed to hold n records.
	if m.in.IsZero() {
		m.in = frame.Make(m.op.Slice, n, n)
	} else {
		m.in = m.in.Ensure(n)
	}
	// From here on n is the number of records actually read.
	n, m.err = m.reader.Read(ctx, m.in.Slice(0, n))
	// Now iterate over each record, transform it, and set the output
	// records. Note that we could parallelize the map operation here,
	// but for simplicity, parallelism should be achieved by finer
	// sharding instead, simplifying management of parallel
	// computation.
	//
	// TODO(marius): provide a vectorized version of map for efficiency.
	args := make([]reflect.Value, m.in.NumOut())
	for i := 0; i < n; i++ {
		// Gather the arguments for a single invocation.
		for j := range args {
			args[j] = m.in.Index(j, i)
		}
		// TODO(marius): consider using an unsafe copy here
		result := m.op.fval.Call(ctx, args)
		for j := range result {
			out.Index(j, i).Set(result[j])
		}
	}
	return n, m.err
}

// Reader returns a reader for a shard of m that wraps the reader of its
// single dependency. The shard number itself is not used.
func (m *mapSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
	return &mapReader{op: m, reader: deps[0]}
}

// filterSlice is the Slice implementation returned by Filter. pred is the
// user-provided predicate.
type filterSlice struct {
	name Name
	Pragma
	Slice
	pred slicefunc.Func
}

// Filter returns a slice where the provided predicate is applied to
// each element in the given slice. The output slice contains only
// those entries for which the predicate is true.
//
// The predicate function should receive each column of slice
// and return a single boolean value.
//
// Schematically:
//
//	Filter(Slice<t1, t2, ..., tn>, func(t1, t2, ..., tn) bool) Slice<t1, t2, ..., tn>
//
// Filter raises a typecheck panic if pred is not a valid slice function, if
// it cannot be applied to the columns of slice, or if it does not return
// exactly one boolean value.
func Filter(slice Slice, pred interface{}, prags ...Pragma) Slice {
	f := new(filterSlice)
	f.name = MakeName("filter")
	f.Slice = slice
	f.Pragma = Pragmas(prags)
	fn, ok := slicefunc.Of(pred)
	if !ok {
		typecheck.Panicf(1, "filter: invalid predicate function %T", pred)
	}
	if !typecheck.CanApply(fn, slice) {
		typecheck.Panicf(1, "filter: function %T does not match input slice type %s", pred, slicetype.String(slice))
	}
	if fn.Out.NumOut() != 1 || fn.Out.Out(0).Kind() != reflect.Bool {
		typecheck.Panic(1, "filter: predicate must return a single boolean value")
	}
	f.pred = fn
	return f
}

func (f *filterSlice) Name() Name             { return f.name }
func (*filterSlice) NumDep() int              { return 1 }
func (f *filterSlice) Dep(i int) Dep          { return singleDep(i, f.Slice, false) }
func (*filterSlice) Combiner() slicefunc.Func { return slicefunc.Nil }

// filterReader reads a shard of a filterSlice by reading records from the
// underlying reader and keeping those accepted by the predicate.
type filterReader struct {
	op     *filterSlice
	reader sliceio.Reader // underlying reader
	in     frame.Frame    // buffer for input records
	err    error          // sticky: once set, every later Read returns it
}

// Read fills out with records accepted by the predicate. It keeps reading
// from the underlying reader until out is full or the underlying reader
// reports an error (including EOF). It returns the number of accepted
// records together with that error, which is remembered for subsequent calls.
func (f *filterReader) Read(ctx context.Context, out frame.Frame) (n int, err error) {
	if f.err != nil {
		return 0, f.err
	}
	if !slicetype.Assignable(out, f.op) {
		return 0, errTypeError
	}
	var (
		m   int         // number of records accepted into out so far
		max = out.Len() // capacity of out
	)
	args := make([]reflect.Value, out.NumOut())
	for m < max && f.err == nil {
		// TODO(marius): this can get pretty inefficient when the accept
		// rate is low: as we fill the output; we could degenerate into a
		// case where we issue a call for each element. Consider input
		// buffering instead.
		//
		// Never request more records than there is room left in out, so that
		// every accepted record has a place to go.
		if f.in.IsZero() {
			f.in = frame.Make(f.op, max-m, max-m)
		} else {
			f.in = f.in.Ensure(max - m)
		}
		n, f.err = f.reader.Read(ctx, f.in)
		for i := 0; i < n; i++ {
			for j := range args {
				args[j] = f.in.Value(j).Index(i)
			}
			if f.op.pred.Call(ctx, args)[0].Bool() {
				frame.Copy(out.Slice(m, m+1), f.in.Slice(i, i+1))
				m++
			}
		}
	}
	return m, f.err
}

// Reader returns a reader for a shard of f that wraps the reader of its
// single dependency. The shard number itself is not used.
func (f *filterSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
	return &filterReader{op: f, reader: deps[0]}
}

// flatmapSlice is the Slice implementation returned by Flatmap. fval is the
// user-provided function and out is the devectorized type of its results.
type flatmapSlice struct {
	name Name
	Pragma
	Slice
	fval slicefunc.Func
	out  slicetype.Type
}

// Flatmap returns a Slice that applies the function fn to each
// record in the slice, flattening the returned slice. That is, the
// function fn should be of the form:
//
//	func(in1 inType1, in2 inType2, ...) (out1 []outType1, out2 []outType2)
//
// Schematically:
//
//	Flatmap(Slice<t1, t2, ..., tn>, func(v1 t1, v2 t2, ..., vn tn) ([]r1, []r2, ..., []rn)) Slice<r1, r2, ..., rn>
//
// Flatmap raises a typecheck panic if fn is not a valid slice function, if
// it cannot be applied to the columns of slice, or if its results are not
// vectorized.
func Flatmap(slice Slice, fn interface{}, prags ...Pragma) Slice {
	f := new(flatmapSlice)
	f.name = MakeName("flatmap")
	f.Slice = slice
	f.Pragma = Pragmas(prags)
	sliceFn, ok := slicefunc.Of(fn)
	if !ok {
		typecheck.Panicf(1, "flatmap: invalid flatmap function %T", fn)
	}
	if !typecheck.CanApply(sliceFn, slice) {
		typecheck.Panicf(1, "flatmap: flatmap function %T does not match input slice type %s", fn, slicetype.String(slice))
	}
	f.out, ok = typecheck.Devectorize(sliceFn.Out)
	if !ok {
		typecheck.Panicf(1, "flatmap: flatmap function %T is not vectorized", fn)
	}
	f.fval = sliceFn
	return f
}

// The column type of a flatmapSlice is the devectorized result type of its
// function, so NumOut and Out override the methods promoted from the embedded
// input Slice.
func (f *flatmapSlice) Name() Name             { return f.name }
func (f *flatmapSlice) NumOut() int            { return f.out.NumOut() }
func (f *flatmapSlice) Out(c int) reflect.Type { return f.out.Out(c) }
func (*flatmapSlice) ShardType() ShardType     { return HashShard }
func (*flatmapSlice) NumDep() int              { return 1 }
func (f *flatmapSlice) Dep(i int) Dep          { return singleDep(i, f.Slice, false) }
func (*flatmapSlice) Combiner() slicefunc.Func { return slicefunc.Nil }

// flatmapReader reads a shard of a flatmapSlice. Because one input record
// can produce any number of output records, it buffers both unconsumed input
// and output that did not fit into the caller's frame.
type flatmapReader struct {
	op     *flatmapSlice
	reader sliceio.Reader // underlying reader

	in           frame.Frame // buffer of inputs
	begIn, endIn int         // unconsumed inputs are in[begIn:endIn]
	out          frame.Frame // buffer of outputs
	eof          bool        // whether the underlying reader has reported EOF
}

// Read fills out with flattened results. Output left over from a previous
// call is returned first; then input records are consumed one at a time
// until out is full or the input is exhausted. Results that do not fit are
// kept for the next call. EOF is returned only once the underlying reader
// has reported EOF and no buffered input or output remains.
func (f *flatmapReader) Read(ctx context.Context, out frame.Frame) (int, error) {
	if !slicetype.Assignable(out, f.op) {
		return 0, errTypeError
	}
	args := make([]reflect.Value, f.op.Slice.NumOut())
	// out[begOut:endOut] is the part of out that is still free.
	begOut, endOut := 0, out.Len()
	// Add buffered output from last call, if any.
	if f.out.Len() > 0 {
		n := frame.Copy(out, f.out)
		begOut += n
		f.out = f.out.Slice(n, f.out.Len())
	}
	// Continue as long as we have (possibly buffered) input, and space
	// for output.
	for begOut < endOut && (!f.eof || f.begIn < f.endIn) {
		if f.begIn == f.endIn {
			// out[0].Len() may not be related to an actually useful size, but we'll go with it.
			// TODO(marius): maybe always default to a fixed chunk size? Or
			// dynamically keep track of the average input:output ratio?
			if f.in.IsZero() {
				f.in = frame.Make(f.op.Slice, out.Len(), out.Len())
			} else {
				f.in = f.in.Ensure(out.Len())
			}
			n, err := f.reader.Read(ctx, f.in)
			if err != nil && err != sliceio.EOF {
				return 0, err
			}
			f.begIn, f.endIn = 0, n
			f.eof = err == sliceio.EOF
		}
		// Consume one input at a time, as long as we have space in our
		// output buffer.
		for ; f.begIn < f.endIn && begOut < endOut; f.begIn++ {
			for j := range args {
				args[j] = f.in.Index(j, f.begIn)
			}
			result := frame.Values(f.op.fval.Call(ctx, args))
			n := frame.Copy(out.Slice(begOut, endOut), result)
			begOut += n
			// We've run out of output space. In this case, stash the rest of
			// our output into f.out, if any.
			if m := result.Len(); n < m {
				f.out = result.Slice(n, m)
			}
		}
	}
	var err error
	// We're EOF if we've encountered an EOF from the underlying
	// reader, there's no buffered output, and no buffered input.
	if f.eof && f.out.Len() == 0 && f.begIn == f.endIn {
		err = sliceio.EOF
	}
	return begOut, err
}

// Reader returns a reader for a shard of f that wraps the reader of its
// single dependency. The shard number itself is not used.
func (f *flatmapSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader {
	return &flatmapReader{op: f, reader: deps[0]}
}

// foldSlice is the Slice implementation returned by Fold. fval is the
// user-provided accumulator function, out is the slice's column type and dep
// is its dependency on the input slice.
type foldSlice struct {
	name Name
	Slice
	fval slicefunc.Func
	out  slicetype.Type
	dep  Dep
}

// Fold returns a slice that aggregates values by the first column
// using a custom aggregation function. For an input slice
// Slice<t1, t2, ..., tn>, Fold requires that the provided accumulator
// function follow the form:
//
//	func(accum acctype, v2 t2, ..., vn tn) acctype
//
// The function is invoked once for each slice element with the same
// value for column 1 (t1). On the first invocation, the accumulator
// is passed the zero value of its accumulator type.
//
// Fold requires that the first column of the slice is partitionable.
// See the documentation for Keyer for more details.
864 // 865 // Schematically: 866 // 867 // Fold(Slice<t1, t2, ..., tn>, func(accum acctype, v2 t2, ..., vn tn) acctype) Slice<t1, acctype> 868 // 869 // BUG(marius): Fold does not yet support slice grouping 870 func Fold(slice Slice, fold interface{}) Slice { 871 if n := slice.NumOut(); n < 2 { 872 typecheck.Panicf(1, "Fold can be applied only for slices with at least two columns; got %d", n) 873 } 874 if !frame.CanHash(slice.Out(0)) { 875 typecheck.Panicf(1, "fold: key type %s is not partitionable", slice.Out(0)) 876 } 877 if !canMakeAccumulatorForKey(slice.Out(0)) { 878 typecheck.Panicf(1, "fold: key type %s cannot be accumulated", slice.Out(0)) 879 } 880 f := new(foldSlice) 881 f.name = MakeName("fold") 882 f.Slice = slice 883 // Fold requires shuffle by the first column. 884 // TODO(marius): allow deps to express shuffling by other columns. 885 f.dep = Dep{slice, true, nil, false} 886 887 fn, ok := slicefunc.Of(fold) 888 if !ok { 889 typecheck.Panicf(1, "fold: invalid fold function %T", fold) 890 } 891 if fn.Out.NumOut() != 1 { 892 typecheck.Panicf(1, "fold: fold functions must return exactly one value") 893 } 894 // func(acc, t2, t3, ..., tn) 895 if got, want := fn.In, slicetype.Append(fn.Out, slicetype.Slice(slice, 1, slice.NumOut())); !typecheck.Equal(got, want) { 896 typecheck.Panicf(1, "fold: expected func(acc, t2, t3, ..., tn), got %T", fold) 897 } 898 f.fval = fn 899 // output: key, accumulator 900 f.out = slicetype.New(slice.Out(0), fn.Out.Out(0)) 901 return f 902 } 903 904 func (f *foldSlice) Name() Name { return f.name } 905 func (f *foldSlice) NumOut() int { return f.out.NumOut() } 906 func (f *foldSlice) Out(c int) reflect.Type { return f.out.Out(c) } 907 func (*foldSlice) NumDep() int { return 1 } 908 func (f *foldSlice) Dep(i int) Dep { return f.dep } 909 func (*foldSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 910 911 type foldReader struct { 912 op *foldSlice 913 reader sliceio.Reader 914 accum Accumulator 915 err error 916 } 917 918 
// Compute accumulates values across all keys in this shard. The entire 919 // output is buffered in memory. 920 func (f *foldReader) compute(ctx context.Context) (Accumulator, error) { 921 in := frame.Make(f.op.dep, defaultChunksize, defaultChunksize) 922 accum := makeAccumulator(f.op.dep.Out(0), f.op.out.Out(1), f.op.fval) 923 for { 924 n, err := f.reader.Read(ctx, in) 925 if err != nil && err != sliceio.EOF { 926 return nil, err 927 } 928 accum.Accumulate(in, n) 929 if err == sliceio.EOF { 930 return accum, nil 931 } 932 } 933 } 934 935 func (f *foldReader) Read(ctx context.Context, out frame.Frame) (int, error) { 936 if f.err != nil { 937 return 0, f.err 938 } 939 if !slicetype.Assignable(out, f.op) { 940 return 0, errTypeError 941 } 942 if f.accum == nil { 943 f.accum, f.err = f.compute(ctx) 944 if f.err != nil { 945 return 0, f.err 946 } 947 } 948 var n int 949 n, f.err = f.accum.Read(out.Value(0), out.Value(1)) 950 return n, f.err 951 } 952 953 func (f *foldSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { 954 return &foldReader{op: f, reader: deps[0]} 955 } 956 957 type headSlice struct { 958 name Name 959 Slice 960 n int 961 } 962 963 // Head returns a slice that returns at most the first n items from 964 // each shard of the underlying slice. Its type is the same as the 965 // provided slice. 
966 func Head(slice Slice, n int) Slice { 967 return &headSlice{MakeName(fmt.Sprintf("head(%d)", n)), slice, n} 968 } 969 970 func (h *headSlice) Name() Name { return h.name } 971 func (*headSlice) NumDep() int { return 1 } 972 func (h *headSlice) Dep(i int) Dep { return singleDep(i, h.Slice, false) } 973 func (*headSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 974 975 type headReader struct { 976 reader sliceio.Reader 977 n int 978 } 979 980 func (h headSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { 981 return &headReader{deps[0], h.n} 982 } 983 984 func (h *headReader) Read(ctx context.Context, out frame.Frame) (n int, err error) { 985 if h.n <= 0 { 986 return 0, sliceio.EOF 987 } 988 n, err = h.reader.Read(ctx, out) 989 h.n -= n 990 if h.n < 0 { 991 n -= -h.n 992 } 993 return 994 } 995 996 type scanSlice struct { 997 name Name 998 Slice 999 scan func(shard int, scanner *sliceio.Scanner) error 1000 } 1001 1002 // Scan invokes a function for each shard of the input Slice. 1003 // It returns a unit Slice: Scan is inteded to be used for its side 1004 // effects. 
1005 func Scan(slice Slice, scan func(shard int, scanner *sliceio.Scanner) error) Slice { 1006 return &scanSlice{MakeName("scan"), slice, scan} 1007 } 1008 1009 func (s *scanSlice) Name() Name { return s.name } 1010 func (*scanSlice) NumOut() int { return 0 } 1011 func (*scanSlice) Out(c int) reflect.Type { panic(c) } 1012 func (*scanSlice) NumDep() int { return 1 } 1013 func (s *scanSlice) Dep(i int) Dep { return singleDep(i, s.Slice, false) } 1014 func (*scanSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 1015 1016 type scanReader struct { 1017 slice scanSlice 1018 shard int 1019 reader sliceio.Reader 1020 } 1021 1022 func (s *scanReader) Read(ctx context.Context, out frame.Frame) (n int, err error) { 1023 err = s.slice.scan(s.shard, sliceio.NewScanner(s.slice.Slice, sliceio.NopCloser(s.reader))) 1024 if err == nil { 1025 err = sliceio.EOF 1026 } 1027 return 0, err 1028 } 1029 1030 func (s scanSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { 1031 return &scanReader{s, shard, deps[0]} 1032 } 1033 1034 type prefixSlice struct { 1035 Pragma 1036 Slice 1037 prefix int 1038 } 1039 1040 // Prefixed returns a slice with the provided prefix. A prefix determines 1041 // the number of columns (starting at 0) in the slice that compose the 1042 // key values for that slice for operations like reduce. For example, prefix of 2 1043 // means that columns 0 and 1 are the key. 
1044 func Prefixed(slice Slice, prefix int) Slice { 1045 if prefix < 1 { 1046 typecheck.Panic(1, "prefixed: prefix must include at least one column") 1047 } 1048 if prefix > slice.NumOut() { 1049 typecheck.Panicf(1, "prefixed: prefix %d is greater than number of columns %d", prefix, slice.NumOut()) 1050 } 1051 var pragma Pragma = Pragmas{} 1052 if slicePragma, ok := slice.(Pragma); ok { 1053 pragma = slicePragma 1054 } 1055 return &prefixSlice{pragma, slice, prefix} 1056 } 1057 1058 func (p *prefixSlice) Prefix() int { return p.prefix } 1059 1060 // Unwrap returns the underlying slice if the provided slice is used 1061 // only to amend the type of the slice it composes. 1062 // 1063 // TODO(marius): this is required to properly compile slices that use the 1064 // prefix combinator; we should have a more general and robust solution 1065 // to this. 1066 func Unwrap(slice Slice) Slice { 1067 if slice, ok := slice.(*prefixSlice); ok { 1068 return Unwrap(slice.Slice) 1069 } 1070 return slice 1071 } 1072 1073 // String returns a string describing the slice and its type. 1074 func String(slice Slice) string { 1075 types := make([]string, slice.NumOut()) 1076 for i := range types { 1077 types[i] = fmt.Sprint(slice.Out(i)) 1078 } 1079 return fmt.Sprintf("%s<%s>", slice.Name().Op, strings.Join(types, ", ")) 1080 } 1081 1082 func singleDep(i int, slice Slice, shuffle bool) Dep { 1083 if i != 0 { 1084 panic(fmt.Sprintf("invalid dependency %d", i)) 1085 } 1086 return Dep{slice, shuffle, nil, false} 1087 } 1088 1089 var ( 1090 helperMu sync.Mutex 1091 helpers = make(map[string]bool) 1092 ) 1093 1094 // Helper is used to mark a function as a helper function: names for 1095 // newly created slices will be attributed to the caller of the 1096 // function instead of the function itself. 
1097 func Helper() { 1098 helperMu.Lock() 1099 defer helperMu.Unlock() 1100 helpers[callerFunc(1)] = true 1101 } 1102 1103 func callerFunc(skip int) string { 1104 var pc [2]uintptr 1105 n := runtime.Callers(skip+2, pc[:]) // skip + runtime.Callers + callerFunc 1106 if n == 0 { 1107 panic("bigslice: zero callers found") 1108 } 1109 frames := runtime.CallersFrames(pc[:n]) 1110 frame, _ := frames.Next() 1111 return frame.Function 1112 } 1113 1114 // Name is a unique name for a slice, constructed with useful context for 1115 // diagnostic or status display. 1116 type Name struct { 1117 // Op is the operation that the slice performs (e.g. "reduce", "map") 1118 Op string 1119 // File is the file in which the slice was defined. 1120 File string 1121 // Line is the line in File at which the slice was defined. 1122 Line int 1123 // Index disambiguates slices created on the same File and Line. 1124 Index int 1125 } 1126 1127 func (n Name) String() string { 1128 return fmt.Sprintf("%s@%s:%d", n.Op, n.File, n.Line) 1129 } 1130 1131 func MakeName(op string) Name { 1132 // Presume the correct frame is the caller of makeName, 1133 // but skip to the frame before the last helper, if any. 1134 var pc [50]uintptr // consider at most 50 frames 1135 n := runtime.Callers(3, pc[:]) // caller of makeName, makeName, runtime.Callers. 
1136 if n == 0 { 1137 panic("bigslice: no callers found") 1138 } 1139 frames := runtime.CallersFrames(pc[:n]) 1140 helperMu.Lock() 1141 var found runtime.Frame 1142 for more := true; more; { 1143 var frame runtime.Frame 1144 frame, more = frames.Next() 1145 if found.PC == 0 { 1146 found = frame 1147 } 1148 if helpers[frame.Function] { 1149 found = runtime.Frame{} 1150 } 1151 } 1152 helperMu.Unlock() 1153 index := newNameIndex(op, found.File, found.Line) 1154 return Name{op, found.File, found.Line, index} 1155 } 1156 1157 type sliceNameIndexerKey struct { 1158 op string 1159 file string 1160 line int 1161 } 1162 1163 var sliceNameIndexerMu sync.Mutex 1164 var sliceNameIndexerMap = make(map[sliceNameIndexerKey]int) 1165 1166 func newNameIndex(op, file string, line int) int { 1167 key := sliceNameIndexerKey{op, file, line} 1168 sliceNameIndexerMu.Lock() 1169 defer sliceNameIndexerMu.Unlock() 1170 c := sliceNameIndexerMap[key] 1171 sliceNameIndexerMap[key]++ 1172 return c 1173 }