github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/slice_test.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package bigslice_test 6 7 import ( 8 "bufio" 9 "bytes" 10 "context" 11 "fmt" 12 "io/ioutil" 13 "math/rand" 14 "os" 15 "reflect" 16 "runtime" 17 "sort" 18 "strings" 19 "sync" 20 "testing" 21 "testing/quick" 22 "text/tabwriter" 23 24 fuzz "github.com/google/gofuzz" 25 "github.com/grailbio/base/errors" 26 "github.com/grailbio/base/log" 27 "github.com/grailbio/bigmachine/rpc" 28 "github.com/grailbio/bigmachine/testsystem" 29 "github.com/grailbio/bigslice" 30 "github.com/grailbio/bigslice/exec" 31 "github.com/grailbio/bigslice/metrics" 32 "github.com/grailbio/bigslice/sliceio" 33 "github.com/grailbio/bigslice/slicetest" 34 "github.com/grailbio/bigslice/typecheck" 35 ) 36 37 func init() { 38 log.AddFlags() // so they can be used in tests 39 } 40 41 func sortColumns(columns []reflect.Value) { 42 s := new(columnSlice) 43 s.keys = columns[0].Interface().([]string) 44 s.swappers = make([]func(i, j int), len(columns)) 45 for i := range columns { 46 s.swappers[i] = reflect.Swapper(columns[i].Interface()) 47 } 48 sort.Stable(s) 49 } 50 51 type columnSlice struct { 52 keys []string 53 swappers []func(i, j int) 54 } 55 56 func (c columnSlice) Len() int { return len(c.keys) } 57 func (c columnSlice) Less(i, j int) bool { return c.keys[i] < c.keys[j] } 58 func (c columnSlice) Swap(i, j int) { 59 for _, swap := range c.swappers { 60 swap(i, j) 61 } 62 } 63 64 var executors = map[string]exec.Option{ 65 "Local": exec.Local, 66 "Bigmachine.Test": exec.Bigmachine(testsystem.New()), 67 } 68 69 func run(ctx context.Context, t *testing.T, slice bigslice.Slice) map[string]*sliceio.Scanner { 70 t.Helper() 71 scannerErrs := runError(ctx, t, slice) 72 scanners := make(map[string]*sliceio.Scanner, len(scannerErrs)) 73 for name, scannerErr := 
range scannerErrs { 74 if err := scannerErr.Err; err != nil { 75 t.Errorf("executor %s error %v", name, err) 76 } else { 77 scanners[name] = scannerErr.Scanner 78 } 79 } 80 return scanners 81 } 82 83 type scannerErr struct { 84 *sliceio.Scanner 85 Err error 86 } 87 88 func runError(ctx context.Context, t *testing.T, slice bigslice.Slice) map[string]scannerErr { 89 t.Helper() 90 results := make(map[string]scannerErr) 91 fn := bigslice.Func(func() bigslice.Slice { return slice }) 92 for name, opt := range executors { 93 if testing.Short() && name != "Local" { 94 continue 95 } 96 sess := exec.Start(opt) 97 // TODO(marius): faster teardown in bigmachine so that we can call this here. 98 // defer sess.Shutdown() 99 res, err := sess.Run(ctx, fn) 100 results[name] = scannerErr{res.Scanner(), err} 101 } 102 return results 103 } 104 105 func assertColumnsEqual(t *testing.T, sort bool, columns ...interface{}) { 106 t.Helper() 107 if len(columns)%2 != 0 { 108 t.Fatal("must pass even number of columns") 109 } 110 numColumns := len(columns) / 2 111 if numColumns < 1 { 112 t.Fatal("must have at least one column to compare") 113 } 114 gotCols := make([]reflect.Value, numColumns) 115 wantCols := make([]reflect.Value, numColumns) 116 for i := range columns { 117 j := i / 2 118 if i%2 == 0 { 119 gotCols[j] = reflect.ValueOf(columns[i]) 120 if gotCols[j].Kind() != reflect.Slice { 121 t.Errorf("column %d of actual must be a slice", j) 122 return 123 } 124 if j > 0 && gotCols[j].Len() != gotCols[j-1].Len() { 125 t.Errorf("got %d, want %d columns in actual", gotCols[j].Len(), gotCols[j-1].Len()) 126 return 127 } 128 } else { 129 // Problems with our expected columns are fatal, as that means that 130 // the test itself is incorrectly constructed. 
131 wantCols[j] = reflect.ValueOf(columns[i]) 132 if wantCols[j].Kind() != reflect.Slice { 133 t.Fatalf("column %d of expected must be a slice", j) 134 } 135 if j > 0 && wantCols[j].Len() != wantCols[j-1].Len() { 136 t.Fatalf("got %d, want %d columns in expected", wantCols[j].Len(), wantCols[j-1].Len()) 137 } 138 } 139 } 140 if sort { 141 sortColumns(gotCols) 142 sortColumns(wantCols) 143 } 144 145 switch got, want := gotCols[0].Len(), wantCols[0].Len(); { 146 case got == want: 147 case got < want: 148 t.Errorf("short result: got %v, want %v", got, want) 149 return 150 case want < got: 151 row := make([]string, len(gotCols)) 152 for i := range row { 153 row[i] = fmt.Sprint(gotCols[i].Index(want).Interface()) 154 } 155 // Show one row of extra values to help debug. 156 t.Errorf("extra values: %v", strings.Join(row, ",")) 157 } 158 159 // wantCols[0].Len() <= gotCols[0].Len() so we compare wantCols[0].Len() 160 // rows. 161 numRows := wantCols[0].Len() 162 got := make([]interface{}, numColumns) 163 want := make([]interface{}, numColumns) 164 for i := 0; i < numColumns; i++ { 165 got[i] = gotCols[i].Interface() 166 want[i] = wantCols[i].Interface() 167 } 168 169 if !reflect.DeepEqual(got, want) { 170 // Print full rows for small results. They are easier to interpret 171 // than diffs. 
172 if numRows < 10 && numColumns < 10 { 173 var ( 174 gotRows = make([]string, numRows) 175 wantRows = make([]string, numRows) 176 ) 177 for i := range gotRows { 178 var ( 179 got = make([]string, numColumns) 180 want = make([]string, numColumns) 181 ) 182 for j := range got { 183 got[j] = fmt.Sprint(gotCols[j].Index(i).Interface()) 184 want[j] = fmt.Sprint(wantCols[j].Index(i).Interface()) 185 } 186 gotRows[i] = strings.Join(got, " ") 187 wantRows[i] = strings.Join(want, " ") 188 } 189 t.Errorf("result mismatch:\ngot:\n%s\nwant:\n%s", strings.Join(gotRows, "\n"), strings.Join(wantRows, "\n")) 190 return 191 } 192 193 // Print as columns 194 var b bytes.Buffer 195 var tw tabwriter.Writer 196 tw.Init(&b, 4, 4, 1, ' ', 0) 197 for i := 0; i < numRows; i++ { 198 var diff bool 199 row := make([]string, numColumns) 200 for j := range row { 201 got := gotCols[j].Index(i).Interface() 202 want := wantCols[j].Index(i).Interface() 203 if !reflect.DeepEqual(got, want) { 204 diff = true 205 row[j] = fmt.Sprintf("%v->%v", want, got) 206 } else { 207 row[j] = fmt.Sprint(got) 208 } 209 } 210 if diff { 211 fmt.Fprintf(&tw, "[%d] %s\n", i, strings.Join(row, "\t")) 212 } 213 } 214 tw.Flush() 215 t.Errorf("result mismatch:\n%s", b.String()) 216 } 217 } 218 219 func assertEqual(t *testing.T, slice bigslice.Slice, sort bool, expect ...interface{}) { 220 if !testing.Short() { 221 rpc.InjectFailures = true 222 defer func() { rpc.InjectFailures = false }() 223 } 224 225 t.Helper() 226 for name, s := range run(context.Background(), t, slice) { 227 t.Run(name, func(t *testing.T) { 228 defer s.Close() 229 args := make([]interface{}, len(expect)) 230 for i := range args { 231 // Make this one larger to make sure we exhaust the scanner. 232 v := reflect.ValueOf(expect[i]) 233 slice := reflect.MakeSlice(v.Type(), v.Len()+1, v.Len()+1) 234 args[i] = slice.Interface() 235 } 236 n, ok := s.Scanv(context.Background(), args...) 
237 if ok { 238 t.Errorf("%s: long read (%d)", name, n) 239 } 240 if err := s.Err(); err != nil { 241 t.Errorf("%s: %v", name, err) 242 return 243 } 244 for i := range args { 245 args[i] = reflect.ValueOf(args[i]).Slice(0, n).Interface() 246 } 247 columns := make([]interface{}, len(expect)*2) 248 for i := range expect { 249 columns[i*2] = args[i] 250 columns[i*2+1] = expect[i] 251 } 252 assertColumnsEqual(t, sort, columns...) 253 }) 254 } 255 } 256 257 func expectTypeError(t *testing.T, message string, fn func()) { 258 t.Helper() 259 typecheck.TestCalldepth = 2 260 _, file, line, ok := runtime.Caller(1) 261 if !ok { 262 t.Fatal("runtime.Caller error") 263 } 264 defer func() { 265 t.Helper() 266 typecheck.TestCalldepth = 0 267 e := recover() 268 if e == nil { 269 t.Fatal("expected error") 270 } 271 err, ok := e.(*typecheck.Error) 272 if !ok { 273 t.Fatalf("expected typeError, got %T", e) 274 } 275 if got, want := err.File, file; got != want { 276 t.Errorf("got %v, want %v", got, want) 277 } 278 if got, want := err.Line, line; got != want { 279 t.Errorf("got %v, want %v", got, want) 280 } 281 if got, want := err.Err.Error(), message; got != want { 282 t.Errorf("got %q, want %q", got, want) 283 } 284 }() 285 fn() 286 } 287 288 type genNshard int 289 290 func (genNshard) Generate(rand *rand.Rand, size int) reflect.Value { 291 // The number of shards must be >= 1 (guaranteed by constSlice 292 // construction). 293 return reflect.ValueOf(genNshard(rand.Intn(size) + 1)) 294 } 295 296 type genNrow int 297 298 func (genNrow) Generate(rand *rand.Rand, size int) reflect.Value { 299 return reflect.ValueOf(genNrow(rand.Intn(size))) 300 } 301 302 // TestConstShard verifies that the algorithm used to shard const slice data 303 // behaves properly. The algorithm must provide shards that: cover the entire 304 // data set, are within the bounds of data, are mutually exclusive, and are 305 // distributed evenly. 
306 func TestConstShard(t *testing.T) { 307 f := func(gN genNrow, gNshard genNshard) bool { 308 var ( 309 n = int(gN) 310 nshard = int(gNshard) 311 covered = make([]bool, n) 312 offsets = make([]int, nshard) 313 counts = make([]int, nshard) 314 ) 315 for shard := 0; shard < nshard; shard++ { 316 offset, count := bigslice.ConstShard(n, nshard, shard) 317 for i := offset; i < offset+count; i++ { 318 if i < 0 || i >= n { 319 // Out of bounds of data. 320 return false 321 } 322 if covered[i] { 323 // Already covered by another shard. 324 return false 325 } 326 covered[i] = true 327 } 328 offsets[shard] = offset 329 offsets[shard] = count 330 } 331 for _, c := range covered { 332 if !c { 333 // Data element that was part of no shard. 334 return false 335 } 336 } 337 if nshard == 0 { 338 return true 339 } 340 var ( 341 minCount = counts[0] 342 maxCount = counts[0] 343 ) 344 for _, c := range counts { 345 if c < minCount { 346 minCount = c 347 } 348 if c > maxCount { 349 maxCount = c 350 } 351 } 352 // Check even distribution. 353 return maxCount-minCount <= 1 354 } 355 // Some known edge cases. 356 for _, c := range []struct { 357 n int 358 nshard int 359 }{ 360 {0, 0}, 361 {10, 10}, // Equal rows and shards. 362 {30, 10}, // Even multiple of shards. 363 {1, 10}, // More shards than elements. 364 } { 365 name := fmt.Sprintf("n:%d nshard:%d", c.n, c.nshard) 366 t.Run(name, func(t *testing.T) { 367 if !f(genNrow(c.n), genNshard(c.nshard)) { 368 t.Errorf("misbehaves") 369 } 370 }) 371 } 372 // Random cases. 
373 t.Run("Quick", func(t *testing.T) { 374 if err := quick.Check(f, nil); err != nil { 375 t.Error(err) 376 } 377 }) 378 } 379 380 func TestConst(t *testing.T) { 381 const N = 10000 382 fz := fuzz.New() 383 fz.NilChance(0) 384 fz.NumElements(N, N) 385 var ( 386 col1 []string 387 col2 []int 388 ) 389 fz.Fuzz(&col1) 390 fz.Fuzz(&col2) 391 for nshards := 1; nshards < 20; nshards++ { 392 slice := bigslice.Const(nshards, col1, col2) 393 assertEqual(t, slice, true, col1, col2) 394 } 395 } 396 397 func TestConstError(t *testing.T) { 398 expectTypeError(t, "const: invalid slice inputs", func() { bigslice.Const(1, 123) }) 399 } 400 401 func TestReaderFunc(t *testing.T) { 402 const ( 403 N = 10000 404 Nshard = 10 405 ) 406 type state struct { 407 *fuzz.Fuzzer 408 total int 409 } 410 slice := bigslice.ReaderFunc(Nshard, func(shard int, state *state, strings []string, ints []int) (n int, err error) { 411 // The input should be zerod by bigslice. 412 var nnonzero int 413 for i := range strings { 414 if strings[i] != "" || ints[i] != 0 { 415 nnonzero++ 416 } 417 } 418 if nnonzero > 0 { 419 t.Errorf("%d (of %d) nonzero rows", nnonzero, len(strings)) 420 } 421 if state.Fuzzer == nil { 422 state.Fuzzer = fuzz.New() 423 } 424 state.NumElements(1, len(strings)) 425 var ( 426 fstrings []string 427 fints []int 428 ) 429 state.Fuzz(&fstrings) 430 state.Fuzz(&fints) 431 n = copy(strings, fstrings) 432 m := copy(ints, fints) 433 if m < n { 434 n = m 435 } 436 state.total += n 437 if state.total >= N { 438 return n - (state.total - N), sliceio.EOF 439 } 440 return n, nil 441 }) 442 // Map everything to the same key so we can count them. 
443 slice = bigslice.Map(slice, func(s string, i int) (key string, count int) { return "", 1 }) 444 slice = bigslice.Fold(slice, func(a, e int) int { return a + e }) 445 assertEqual(t, slice, false, []string{""}, []int{N * Nshard}) 446 } 447 448 func TestReaderFuncError(t *testing.T) { 449 expectTypeError(t, "readerfunc: invalid reader function type func()", func() { bigslice.ReaderFunc(1, func() {}) }) 450 expectTypeError(t, "readerfunc: invalid reader function type string", func() { bigslice.ReaderFunc(1, "invalid") }) 451 expectTypeError(t, "readerfunc: invalid reader function type func(string, string, []int) (int, error)", func() { bigslice.ReaderFunc(1, func(shard string, state string, x []int) (int, error) { panic("") }) }) 452 expectTypeError(t, "readerfunc: function func(int, string, []int) error does not return (int, error)", func() { bigslice.ReaderFunc(1, func(shard int, state string, x []int) error { panic("") }) }) 453 expectTypeError(t, "readerfunc: invalid reader function type func(int, string) (int, error)", func() { bigslice.ReaderFunc(1, func(shard int, state string) (int, error) { panic("") }) }) 454 } 455 456 const readerFuncForgetEOFMessage = "warning: reader func returned empty vector" 457 458 // TestReaderFuncForgetEOF runs a buggy ReaderFunc that never returns sliceio.EOF. We check that 459 // bigslice prints a warning. 460 func TestReaderFuncForgetEOF(t *testing.T) { 461 var logOut bytes.Buffer 462 log.SetOutput(&logOut) 463 const N = 500 464 slice := bigslice.ReaderFunc(1, func(_ int, state *int, _ []int) (int, error) { 465 // Simulate an empty input. Users should return sliceio.EOF immediately, but some forget 466 // and just return nil. Eventually return EOF so the test terminates. 
467 if *state >= N { 468 return 0, sliceio.EOF 469 } 470 *state++ 471 return 0, nil 472 }) 473 assertEqual(t, slice, false, []int{}) 474 if !strings.Contains(logOut.String(), readerFuncForgetEOFMessage) { 475 t.Errorf("expected empty vector log message, got: %q", logOut.String()) 476 } 477 } 478 479 // TestReaderFuncNoForgetEOF complements TestReaderFuncForgetEOF, testing that no spurious log 480 // messages are written if reader funcs return non-empty vectors. 481 func TestReaderFuncNoForgetEOF(t *testing.T) { 482 var logOut bytes.Buffer 483 log.SetOutput(&logOut) 484 const N = 500 485 slice := bigslice.ReaderFunc(1, func(_ int, state *int, out []int) (int, error) { 486 // Simulate an empty input. Users should return sliceio.EOF immediately, but some forget 487 // and just return nil. Eventually return EOF so the test terminates. 488 if *state >= N { 489 return 0, sliceio.EOF 490 } 491 *state++ 492 return 1, nil 493 }) 494 assertEqual(t, slice, false, make([]int, N)) 495 if strings.Contains(logOut.String(), readerFuncForgetEOFMessage) { 496 t.Errorf("expected no empty vector log message, got: %q", logOut.String()) 497 } 498 } 499 500 // TestWriterFunc tests the basic functionality of WriterFunc, verifying that 501 // all data is passed to the write function, and all data is available in the 502 // resulting slice. 503 func TestWriterFunc(t *testing.T) { 504 const ( 505 N = 10000 506 Nshard = 10 507 ) 508 fz := fuzz.New() 509 fz.NilChance(0) 510 fz.NumElements(N, N) 511 var ( 512 col1 []string 513 col2 []int 514 ) 515 fz.Fuzz(&col1) 516 fz.Fuzz(&col2) 517 518 slice := bigslice.Const(Nshard, col1, col2) 519 520 type state struct { 521 col1 []string 522 col2 []int 523 errs []error 524 } 525 var ( 526 writerMutex sync.Mutex 527 // The states of the writers, by shard. 528 writerStates []state 529 ) 530 slice = bigslice.WriterFunc(slice, 531 func(shard int, state *state, err error, col1 []string, col2 []int) error { 532 state.col1 = append(state.col1, col1...) 
533 state.col2 = append(state.col2, col2...) 534 state.errs = append(state.errs, err) 535 if err != nil { 536 writerMutex.Lock() 537 defer writerMutex.Unlock() 538 writerStates[shard] = *state 539 } 540 return nil 541 }) 542 543 // We expect both the columns written by the writer func and the columns in 544 // the resulting slice to match the input. We make a copy to avoid 545 // disturbing the inputs, as we'll end up sorting these to compare them. 546 wantCol1 := append([]string{}, col1...) 547 wantCol2 := append([]int{}, col2...) 548 549 ctx := context.Background() 550 fn := bigslice.Func(func() bigslice.Slice { return slice }) 551 for name, opt := range executors { 552 t.Run(name, func(t *testing.T) { 553 // Each execution starts with a fresh state for the writer. 554 writerStates = make([]state, Nshard) 555 sess := exec.Start(opt) 556 res, err := sess.Run(ctx, fn) 557 if err != nil { 558 t.Errorf("executor %s error %v", name, err) 559 return 560 } 561 562 // Check the columns in the output slice. 563 scanner := res.Scanner() 564 defer scanner.Close() 565 var ( 566 s string 567 i int 568 resCol1 []string 569 resCol2 []int 570 ) 571 for scanner.Scan(context.Background(), &s, &i) { 572 resCol1 = append(resCol1, s) 573 resCol2 = append(resCol2, i) 574 } 575 assertColumnsEqual(t, true, resCol1, wantCol1, resCol2, wantCol2) 576 577 // Check the columns written by the writer func. 578 var ( 579 writerCol1 []string 580 writerCol2 []int 581 ) 582 for _, state := range writerStates { 583 writerCol1 = append(writerCol1, state.col1...) 584 writerCol2 = append(writerCol2, state.col2...) 585 } 586 assertColumnsEqual(t, true, writerCol1, wantCol1, writerCol2, wantCol2) 587 588 // Check that errors were passed as expected to the writer func. 
589 for shard, state := range writerStates { 590 if len(state.errs) < 1 { 591 t.Errorf("writer for shard %d did not get EOF", shard) 592 continue 593 } 594 for i := 0; i < len(state.errs)-1; i++ { 595 if state.errs[i] != nil { 596 // Only the last error received should be non-nil. 597 t.Errorf("got premature error") 598 break 599 } 600 } 601 if got, want := state.errs[len(state.errs)-1], sliceio.EOF; got != want { 602 t.Errorf("got %v, want %v", got, want) 603 } 604 } 605 }) 606 } 607 } 608 609 // TestWriterFuncBadFunc tests the type-checking of the writer func passed to 610 // WriterFunc. 611 func TestWriterFuncBadFunc(t *testing.T) { 612 for _, c := range []struct { 613 name string 614 message string 615 f interface{} 616 }{ 617 { 618 "String", 619 "writerfunc: invalid writer function type string; must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 620 "I'm not a function at all", 621 }, 622 { 623 "NoArguments", 624 "writerfunc: invalid writer function type func(); must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 625 func() {}, 626 }, 627 { 628 "NonSliceColumn", 629 "writerfunc: invalid writer function type func(int, int, error, string, []int) error; must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 630 func(shard int, state int, err error, col1 string, col2 []int) error { panic("") }, 631 }, 632 { 633 "NotEnoughColumns", 634 "writerfunc: invalid writer function type func(int, int, error, []string) error; must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 635 func(shard int, state int, err error, col1 []string) error { panic("") }, 636 }, 637 { 638 "TooManyColumns", 639 "writerfunc: invalid writer function type func(int, int, error, []string, []int, []int) error; must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 640 func(shard int, state int, err error, col1 []string, col2 []int, 
col3 []int) error { panic("") }, 641 }, 642 { 643 "StringShard", 644 "writerfunc: invalid writer function type func(string, int, error, []string, []int) error; must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 645 func(shard string, state int, err error, col1 []string, col2 []int) error { panic("") }, 646 }, 647 { 648 "WrongColumnElementType", 649 "writerfunc: invalid writer function type func(int, int, error, []string, []string) error; must be func(shard int, state stateType, err error, col1 []string, col2 []int) error", 650 func(shard int, state int, err error, col1 []string, col2 []string) error { panic("") }, 651 }, 652 { 653 "NoReturn", 654 "writerfunc: invalid writer function type func(int, int, error, []string, []int); must return error", 655 func(shard int, state int, err error, col1 []string, col2 []int) { panic("") }, 656 }, 657 { 658 "ReturnInt", 659 "writerfunc: invalid writer function type func(int, int, error, []string, []int) int; must return error", 660 func(shard int, state int, err error, col1 []string, col2 []int) int { panic("") }, 661 }, 662 } { 663 t.Run(c.name, func(t *testing.T) { 664 slice := bigslice.Const(1, []string{}, []int{}) 665 expectTypeError(t, c.message, func() { bigslice.WriterFunc(slice, c.f) }) 666 }) 667 } 668 } 669 670 // TestWriterFuncError tests the behavior of WriterFunc under various error 671 // conditions. 
func TestWriterFuncError(t *testing.T) {
	// assertWriterErr runs slice on every executor, in one subtest each, and
	// checks that the run fails with an error whose message contains
	// "writerError".
	assertWriterErr := func(t *testing.T, slice bigslice.Slice) {
		fn := bigslice.Func(func() bigslice.Slice { return slice })
		for name, opt := range executors {
			t.Run(name, func(t *testing.T) {
				sess := exec.Start(opt)
				_, err := sess.Run(context.Background(), fn)
				if err == nil {
					t.Errorf("expected error")
				} else {
					if got, want := err.Error(), "writerError"; !strings.Contains(got, want) {
						t.Errorf("got %v, want %v", got, want)
					}
				}
			})
		}
	}

	// The write function always returns an error, so we should see it.
	t.Run("WriteAlwaysErr", func(t *testing.T) {
		slice := bigslice.Const(2, []string{"a", "b", "c", "d"})
		slice = bigslice.WriterFunc(slice, func(shard int, state int, err error, col1 []string) error {
			return errors.New("writerError")
		})
		assertWriterErr(t, slice)
	})

	// The write function returns an error when it sees the EOF. We expect to
	// see the returned error, even though the underlying read succeeded
	// without error.
	t.Run("WriteErrOnEOF", func(t *testing.T) {
		slice := bigslice.Const(2, []string{"a", "b", "c", "d"})
		slice = bigslice.WriterFunc(slice, func(shard int, state int, err error, col1 []string) error {
			if err == sliceio.EOF {
				return errors.New("writerError")
			}
			return nil
		})
		assertWriterErr(t, slice)
	})
}

// TestMap verifies that Map applies its function to every row: the integers
// 0..N-1 are mapped to their decimal string representations.
func TestMap(t *testing.T) {
	const N = 100000
	input := make([]int, N)
	output := make([]string, N)
	for i := range input {
		input[i] = i
		output[i] = fmt.Sprint(i)
	}
	slice := bigslice.Const(1, input)
	slice = bigslice.Map(slice, func(i int) string { return fmt.Sprint(i) })
	assertEqual(t, slice, false, output)
}

// TestMapError verifies the type errors reported by Map for invalid map
// functions.
func TestMapError(t *testing.T) {
	input := bigslice.Const(1, []string{"x", "y"})
	expectTypeError(t, "map: invalid map function int", func() { bigslice.Map(input, 123) })
	expectTypeError(t, "map: function func(int) string does not match input slice type slice[1]string", func() { bigslice.Map(input, func(x int) string { return "" }) })
	expectTypeError(t, "map: function func(int, int) string does not match input slice type slice[1]string", func() { bigslice.Map(input, func(x, y int) string { return "" }) })
	expectTypeError(t, "map: need at least one output column", func() { bigslice.Map(input, func(x string) {}) })
}

// TestFilter verifies Filter with three predicates over the integers
// 0..N-1: one keeping the even values, one rejecting everything, and one
// keeping exactly three values.
func TestFilter(t *testing.T) {
	const N = 100000
	input := make([]int, N)
	output := make([]int, N/2)
	for i := range input {
		input[i] = i
		if i%2 == 0 {
			output[i/2] = i
		}
	}
	slice := bigslice.Const(N/1000, input)
	slice = bigslice.Filter(slice, func(i int) bool { return i%2 == 0 })
	assertEqual(t, slice, false, output)

	slice = bigslice.Const(1, input)
	slice = bigslice.Filter(slice, func(i int) bool { return false })
	assertEqual(t, slice, false, []int{})

	slice = bigslice.Const(1, input)
	slice = bigslice.Filter(slice, func(i int) bool {
		switch i {
		case N / 4, N / 2, 3 * N / 4:
			return true
		default:
			return false
		}
	})
	assertEqual(t, slice, false, []int{N / 4, N / 2, 3 * N / 4})
}

// TestFilterError verifies the type errors reported by Filter for invalid
// predicates.
func TestFilterError(t *testing.T) {
	input := bigslice.Const(1, []string{"x", "y"})
	expectTypeError(t, "filter: invalid predicate function int", func() { bigslice.Filter(input, 123) })
	expectTypeError(t, "filter: function func(int) bool does not match input slice type slice[1]string", func() { bigslice.Filter(input, func(x int) bool { return false }) })
	expectTypeError(t, "filter: function func(int, int) string does not match input slice type slice[1]string", func() { bigslice.Filter(input, func(x, y int) string { return "" }) })
	expectTypeError(t, "filter: predicate must return a single boolean value", func() { bigslice.Filter(input, func(x string) {}) })
	expectTypeError(t, "filter: predicate must return a single boolean value", func() { bigslice.Filter(input, func(x string) int { return 0 }) })
	expectTypeError(t, "filter: predicate must return a single boolean value", func() { bigslice.Filter(input, func(x string) (bool, int) { return false, 0 }) })
}

// TestFlatmap verifies Flatmap on single- and multi-column outputs, on
// functions that drop some or all rows, and on a larger input. Note that
// each step up to "Partial filter" builds on the slice produced by the
// previous step.
func TestFlatmap(t *testing.T) {
	slice := bigslice.Const(2, []string{"x,x", "y,y,y", "z", "", "x"})
	slice = bigslice.Flatmap(slice, func(s string) []string {
		if s == "" {
			return nil
		}
		return strings.Split(s, ",")
	})
	assertEqual(t, slice, true, []string{"x", "x", "x", "y", "y", "y", "z"})

	// Multiple columns
	slice = bigslice.Flatmap(slice, func(s string) ([]string, []int) {
		return []string{s}, []int{len(s)}
	})
	assertEqual(t, slice, true,
		[]string{"x", "x", "x", "y", "y", "y", "z"},
		[]int{1, 1, 1, 1, 1, 1, 1},
	)

	// Filter everything
	slice = bigslice.Flatmap(slice, func(s string, i int) []string {
		return nil
	})
	assertEqual(t, slice, true, []string{})

	// Partial filter
	slice = bigslice.Const(1, []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10})
	slice = bigslice.Flatmap(slice, func(i int) []int {
		if i%2 == 0 {
			return []int{i}
		}
		return nil
	})
	assertEqual(t, slice, false, []int{0, 2, 4, 6, 8, 10})

	// Large slices
	input := make([]string, 1024*10)
	for i := range input {
		input[i] = fmt.Sprint(i)
	}
	slice = bigslice.Const(5, input)
	slice = bigslice.Flatmap(slice, func(s string) []string {
		switch s {
		case "1024":
			return []string{s}
		case "5000":
			return []string{s}
		default:
			return nil
		}
	})
	assertEqual(t, slice, true, []string{"1024", "5000"})
}

// TestFlatmapBuffered verifies Flatmap when a single input row expands to
// 1025 output rows.
func TestFlatmapBuffered(t *testing.T) {
	zeros := make([]int, 1025)
	slice := bigslice.Const(1, []int{0})
	slice = bigslice.Flatmap(slice, func(i int) []int {
		return zeros
	})
	// Drive it manually:
	assertEqual(t, slice, false, zeros)
}

// TestFlatmapError verifies the type errors reported by Flatmap for invalid
// flatmap functions.
func TestFlatmapError(t *testing.T) {
	input := bigslice.Const(1, []int{1, 2, 3})
	expectTypeError(t, "flatmap: invalid flatmap function int", func() { bigslice.Flatmap(input, 123) })
	expectTypeError(t, "flatmap: flatmap function func(string) []int does not match input slice type slice[1]int", func() { bigslice.Flatmap(input, func(s string) []int { return nil }) })
	expectTypeError(t, "flatmap: flatmap function func(int) int is not vectorized", func() { bigslice.Flatmap(input, func(i int) int { return 0 }) })
	expectTypeError(t, "flatmap: flatmap function func(int, int) []int does not match input slice type slice[1]int", func() { bigslice.Flatmap(input, func(i, j int) []int { return nil }) })
}

// TestFold verifies Fold by summing fuzzed values per key and comparing the
// result with a sum computed locally in a map. Every key appears at least
// twice because the fuzzed columns are appended to themselves.
func TestFold(t *testing.T) {
	const N = 10000
	fz := fuzz.New()
	fz.NilChance(0)
	fz.NumElements(N/2, N/2)
	var (
		keys   []string
		values []int
	)
	fz.Fuzz(&keys)
	fz.Fuzz(&values)
	keys = append(keys, keys...)
	values = append(values, values...)
	slice := bigslice.Const(N/1000, keys, values)
	slice = bigslice.Fold(slice, func(a, e int) int { return a + e })

	expect := make(map[string]int)
	for i, key := range keys {
		expect[key] += values[i]
	}
	var (
		expectKeys   []string
		expectValues []int
	)
	for key, value := range expect {
		expectKeys = append(expectKeys, key)
		expectValues = append(expectValues, value)
	}
	assertEqual(t, slice, true, expectKeys, expectValues)

	// Make sure we can partition other element types also.
	slice = bigslice.Const(N/1000, values, keys)
	slice = bigslice.Fold(slice, func(a int, e string) int { return a + len(e) })
	slice = bigslice.Map(slice, func(key, count int) (int, int) { return 0, count })
	slice = bigslice.Fold(slice, func(a, e int) int { return a + e })
	var totalSize int
	for _, key := range keys {
		totalSize += len(key)
	}
	assertEqual(t, slice, false, []int{0}, []int{totalSize})
}

// TestFoldError verifies the type errors reported by Fold for unsupported
// inputs and invalid fold functions.
func TestFoldError(t *testing.T) {
	input := bigslice.Const(1, []int{1, 2, 3})
	floatInput := bigslice.Map(input, func(x int) (float64, int) { return 0, 0 })
	intInput := bigslice.Map(input, func(x int) (int, int) { return 0, 0 })
	expectTypeError(t, "fold: key type float64 cannot be accumulated", func() { bigslice.Fold(floatInput, func(x int) int { return 0 }) })
	expectTypeError(t, "Fold can be applied only for slices with at least two columns; got 1", func() { bigslice.Fold(input, func(x int) int { return 0 }) })
	expectTypeError(t, "fold: expected func(acc, t2, t3, ..., tn), got func(int) int", func() { bigslice.Fold(intInput, func(x int) int { return 0 }) })
	expectTypeError(t, "fold: expected func(acc, t2, t3, ..., tn), got func(int, int) string", func() { bigslice.Fold(intInput, func(a, x int) string { return "" }) })
	expectTypeError(t, "fold: fold functions must return exactly one value", func() { bigslice.Fold(intInput, func(a, x int) (int, int) { return 0, 0 }) })
	expectTypeError(t, "fold: expected func(acc, t2, t3, ..., tn), got func(int, string) int", func() { bigslice.Fold(intInput, func(a int, x string) int { return 0 }) })
}

// TestHead verifies Head on a two-shard slice of ten rows with a limit of 2.
// The expected rows (1, 2, 6, 7) are the first two rows of each half of the
// input, i.e. the limit appears to apply per shard.
func TestHead(t *testing.T) {
	slice := bigslice.Head(bigslice.Const(2, []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 0}), 2)
	assertEqual(t, slice, false, []int{1, 2, 6, 7})
}

// TestPrefixedPragma verifies that Prefixed slices properly adopt pragmas from
// their underlying slices.
func TestPrefixedPragma(t *testing.T) {
	slice := bigslice.Const(2, []int{0, 1, 2}, []string{"a", "b", "c"})
	slice = bigslice.Map(slice, func(i int, s string) (int, string) {
		return i, s
	}, bigslice.Exclusive)
	slice = bigslice.Prefixed(slice, 2)
	pragma, ok := slice.(bigslice.Pragma)
	if !ok {
		t.Fatal("Prefixed does not implement Pragma")
	}
	if !pragma.Exclusive() {
		t.Error("Prefixed not Exclusive")
	}
}

// TestScan verifies that Scan invokes its function once per shard and that
// the scanner passed to it yields every input row exactly once. The counts
// accumulate across executors, so each is compared with the number of
// executors that ran successfully.
func TestScan(t *testing.T) {
	const (
		N      = 10000
		Nshard = 10
	)
	input := make([]int, N)
	for i := range input {
		input[i] = i
	}
	// mu guards output and shards, which the scan function updates.
	var mu sync.Mutex
	output := make([]int, N)
	shards := make([]int, Nshard)
	slice := bigslice.Const(Nshard, input)
	slice = bigslice.Scan(slice, func(shard int, scan *sliceio.Scanner) error {
		mu.Lock()
		defer mu.Unlock()
		shards[shard]++
		var elem int
		ctx := context.Background()
		for scan.Scan(ctx, &elem) {
			output[elem]++
		}
		return scan.Err()
	})
	// n is the number of executors whose run succeeded.
	n := len(run(context.Background(), t, slice))
	for i, got := range output {
		if want := n; got != want {
			t.Errorf("wrong count for output %d, got %v, want %v", i, got, want)
		}
	}
	for i, got := range shards {
		if want := n; got != want {
			t.Errorf("wrong count for shard %d, got %v, want %v", i, got, want)
		}
	}
}

// TestPanic verifies that, on every executor, a panic in a user-supplied
// function is returned from Run as an error whose message contains "panic
// while evaluating slice".
func TestPanic(t *testing.T) {
	slice := bigslice.Const(1, []int{1, 2, 3})
	slice = bigslice.Map(slice, func(i int) int {
		panic(i)
	})
	fn := bigslice.Func(func() bigslice.Slice { return slice })
	ctx := context.Background()
	for name, opt := range executors {
		sess := exec.Start(opt)
		// TODO(marius): faster teardown in bigmachine so that we can call this here.
		// defer sess.Shutdown()
		_, err := sess.Run(ctx, fn)
		if err == nil {
			t.Errorf("executor %s: expected error", name)
			continue
		}
		if msg := err.Error(); !strings.Contains(msg, "panic while evaluating slice") {
			t.Errorf("wrong error message %q", msg)
		}
	}
}

// TestEncodingError verifies that reducing over a value type with no
// exported fields fails, on executors other than "Local", with a Remote,
// Fatal error that carries the underlying gob message.
func TestEncodingError(t *testing.T) {
	type ungobable struct {
		x int
	}
	slice := bigslice.Const(1, []int{1, 2, 3})
	slice = bigslice.Map(slice, func(x int) (int, ungobable) { return x, ungobable{x} })
	slice = bigslice.Reduce(slice, func(a, e ungobable) ungobable { return ungobable{a.x + e.x} })

	scannerErrs := runError(context.Background(), t, slice)
	for name, scannerErr := range scannerErrs {
		// The local executor keeps things in memory by default.
		// Note that while the Bigmachine executors currently run everything
		// through gob by default, this is not at all a requirement. So this
		// test may begin failing in the presence of future optimizations.
		if name == "Local" {
			continue
		}
		err := scannerErr.Err
		if err == nil {
			t.Errorf("%s: expected error", name)
			continue
		}
		expected := errors.E(errors.Remote, errors.Fatal)
		if !errors.Match(expected, err) {
			t.Errorf("error %s: expected Remote, Fatal", err)
		}
		if !strings.Contains(err.Error(), "gob: type bigslice_test.ungobable has no exported fields") {
			t.Errorf("error %s: expected gob error", err)
		}
	}
}

// TestMetrics verifies that a counter incremented inside a Map function can
// be read back through the scope of the run's result: after mapping over 1,
// 2 and 3, the counter must read 6 on every executor.
func TestMetrics(t *testing.T) {
	counter := metrics.NewCounter()
	slice := bigslice.Const(1, []int{1, 2, 3})
	slice = bigslice.Map(slice, func(ctx context.Context, i int) int {
		counter.Incr(metrics.ContextScope(ctx), int64(i))
		return i
	})
	fn := bigslice.Func(func() bigslice.Slice { return slice })
	ctx := context.Background()
	for name, opt := range executors {
		sess := exec.Start(opt)
		res, err := sess.Run(ctx, fn)
		if err != nil {
			t.Errorf("executor %s: %v", name, err)
			continue
		}
		if got, want := counter.Value(res.Scope()), int64(6); got != want {
			t.Errorf("executor %s: got %v, want %v", name, got, want)
		}
	}
}

func ExampleConst() {
	slice := bigslice.Const(2,
		[]int{0, 1, 2, 3},
		[]string{"zero", "one", "two", "three"},
	)
	slicetest.Print(slice)
	// Output:
	// 0 zero
	// 1 one
	// 2 two
	// 3 three
}

func ExampleFilter() {
	slice := bigslice.Const(2,
		[]int{0, 1, 2, 3, 4, 5},
		[]string{"zero", "one", "two", "three", "four", "five"},
	)
	slice = bigslice.Filter(slice, func(x int, s string) bool {
		return x%2 == 0
	})
	slicetest.Print(slice)
	// Output:
	// 0 zero
	// 2 two
	// 4 four
}

func ExampleFlatmap() {
	// Flatmap to split strings into words using different separators. The input
	// is of type Slice<string, string>:
	// - col0: the string
	// - col1: the separator
	//
	// The output is of type Slice<string, int>:
	// - col0: a word from the input strings
	// - col1: the length of the word
	slice := bigslice.Const(2,
		[]string{
			"Lorem ipsum dolor sit amet",
			"consectetur:adipiscing",
			"elit",
			"sed.do.eiusmod.tempor.incididunt",
		},
		[]string{" ", ":", ";", "."}, // Separators.
	)
	slice = bigslice.Flatmap(slice, func(s, sep string) ([]string, []int) {
		split := strings.Split(s, sep)
		lengths := make([]int, len(split))
		for i := range lengths {
			lengths[i] = len(split[i])
		}
		return split, lengths
	})
	slicetest.Print(slice)
	// Output:
	// Lorem 5
	// adipiscing 10
	// amet 4
	// consectetur 11
	// do 2
	// dolor 5
	// eiusmod 7
	// elit 4
	// incididunt 10
	// ipsum 5
	// sed 3
	// sit 3
	// tempor 6
}

func ExampleFold() {
	// Fold over the input Slice<string, int, string> to accumulate a struct
	// holding:
	// - the sum of the integers in col1.
	// - the product of the integers in col1.
	// - the longest string encountered in col2.
	slice := bigslice.Const(2,
		[]string{"c", "a", "b", "c", "c", "b", "a", "a", "a", "a", "c"},
		[]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
		[]string{
			"Lorem",
			"ipsum",
			"dolor",
			"sit",
			"amet",
			"consectetur",
			"adipiscing",
			"elit",
			"sed",
			"do",
			"eiusmod",
		},
	)
	type accum struct {
		ready bool
		// sum is the sum of integers in the second column.
		sum int
		// product is the product of integers in the second column.
		product int
		// longest is the longest string in the third column.
1139 longest string 1140 } 1141 slice = bigslice.Fold(slice, func(acc accum, i int, s string) accum { 1142 if !acc.ready { 1143 // Initialize product as the multiplicative identity, 1. 1144 acc.product = 1 1145 acc.ready = true 1146 } 1147 acc.sum += i 1148 acc.product *= i 1149 if len(acc.longest) < len(s) { 1150 acc.longest = s 1151 } 1152 return acc 1153 }) 1154 slicetest.Print(slice) 1155 // Output: 1156 // a {true 36 10080 adipiscing} 1157 // b {true 9 18 consectetur} 1158 // c {true 21 220 eiusmod} 1159 } 1160 1161 func ExampleHead() { 1162 // Use one shard, as Head operates per shard. 1163 slice := bigslice.Const(1, 1164 []int{0, 1, 2, 3, 4, 5}, 1165 []string{"zero", "one", "two", "three", "four", "five"}, 1166 ) 1167 slice = bigslice.Head(slice, 3) 1168 slicetest.Print(slice) 1169 // Output: 1170 // 0 zero 1171 // 1 one 1172 // 2 two 1173 } 1174 1175 func ExampleMap() { 1176 // Map an input of Slice<int, string>...: 1177 // - col0: an integer 1178 // - col1: a label for that integer 1179 // 1180 // ... to a Slice<int, string, int, string>: 1181 // - col0: original integer 1182 // - col1: original label 1183 // - col2: square of original integer 1184 // - col3: original label with ".squared" appended 1185 slice := bigslice.Const(2, 1186 []int{0, 1, 2, 3}, 1187 []string{"zero", "one", "two", "three"}, 1188 ) 1189 slice = bigslice.Map(slice, func(x int, s string) (int, string, int, string) { 1190 return x, s, x * x, s + ".squared" 1191 }) 1192 slicetest.Print(slice) 1193 // Output: 1194 // 0 zero 0 zero.squared 1195 // 1 one 1 one.squared 1196 // 2 two 4 two.squared 1197 // 3 three 9 three.squared 1198 } 1199 1200 func ExamplePrefixed() { 1201 // Count the number of pets of the same type and name by using Prefixed to 1202 // make a slice with the type and name columns as the key, then using Reduce 1203 // to count the number of elements that have that key. 
1204 slice := bigslice.Const(2, 1205 []string{ 1206 "dog", 1207 "dog", 1208 "cat", 1209 "cat", 1210 "cat", 1211 "fish", 1212 "dog", 1213 "dog", 1214 "cat", 1215 "fish", 1216 "fish", 1217 }, 1218 []string{ 1219 "spot", 1220 "spot", 1221 "mittens", 1222 "socks", 1223 "socks", 1224 "nemo", 1225 "lassie", 1226 "spot", 1227 "mittens", 1228 "nemo", 1229 "dory", 1230 }, 1231 []int{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, 1232 ) 1233 slice = bigslice.Prefixed(slice, 2) 1234 slice = bigslice.Reduce(slice, func(a, b int) int { return a + b }) 1235 slicetest.Print(slice) 1236 // Output: 1237 // cat mittens 2 1238 // cat socks 2 1239 // dog lassie 1 1240 // dog spot 3 1241 // fish dory 1 1242 // fish nemo 2 1243 } 1244 1245 func ExampleReaderFunc() { 1246 // Use ReaderFunc to make an evenly sharded Slice<int, string> from the 1247 // alphabet: 1248 // - col0: the 1-indexed index of the letter in the alphabet 1249 // - col1: the letter 1250 const numShards = 6 1251 const alphabet = "abcdefghijklmnopqrstuvwxyz" 1252 type state struct { 1253 // next is the index of the next element of the alphabet to be read. 1254 next int 1255 } 1256 slice := bigslice.ReaderFunc(numShards, 1257 func(shard int, s *state, is []int, ss []string) (int, error) { 1258 // Each shard will handle a portion of the alphabet. 1259 // Shard 0 reads letters 1, 7, 13, .... 1260 // Shard 1 reads letters 2, 8, 14, .... 1261 // ... 1262 // Shard 5 reads letters 6, 12, 18, .... 1263 if s.next == 0 { 1264 // This is the first call, so we initialize our state. 1265 s.next = shard + 1 1266 } 1267 for n := 0; ; n++ { 1268 if len(alphabet) < s.next { 1269 // Our shard is complete, so return EOF. 1270 return n, sliceio.EOF 1271 } 1272 if n == len(is) { 1273 // We have filled the passed buffers, so there is nothing 1274 // left to do in this invocation. 
1275 return n, nil 1276 } 1277 is[n] = s.next 1278 ss[n] = string(alphabet[s.next-1]) 1279 s.next += numShards 1280 } 1281 }) 1282 slicetest.Print(slice) 1283 // Output: 1284 // 1 a 1285 // 2 b 1286 // 3 c 1287 // 4 d 1288 // 5 e 1289 // 6 f 1290 // 7 g 1291 // 8 h 1292 // 9 i 1293 // 10 j 1294 // 11 k 1295 // 12 l 1296 // 13 m 1297 // 14 n 1298 // 15 o 1299 // 16 p 1300 // 17 q 1301 // 18 r 1302 // 19 s 1303 // 20 t 1304 // 21 u 1305 // 22 v 1306 // 23 w 1307 // 24 x 1308 // 25 y 1309 // 26 z 1310 } 1311 1312 func ExampleScan() { 1313 // Use Scan to write a file for each shard of the input. Each file will 1314 // contain a line for each row in the shard. 1315 const numShards = 2 1316 slice := bigslice.Const(numShards, 1317 []string{"a", "b", "c", "a", "b", "c"}, 1318 []int{3, 3, 2, 2, 1, 1}, 1319 ) 1320 // For this simple example, use shared memory to store the paths to these 1321 // files so that we can easily aggregate the files for output. If we were 1322 // distributing this computation across machines without access to shared 1323 // memory, we'd need to use a different mechanism, e.g. write files to a 1324 // common backing store with a known prefix. 1325 shardPaths := make([]string, numShards) 1326 slice = bigslice.Scan(slice, 1327 func(shard int, scanner *sliceio.Scanner) error { 1328 file, err := ioutil.TempFile("", "example-scan") 1329 if err != nil { 1330 return fmt.Errorf("could not open temp file: %v", err) 1331 } 1332 shardPaths[shard] = file.Name() 1333 var ( 1334 s string 1335 x int 1336 ) 1337 for scanner.Scan(context.Background(), &s, &x) { 1338 // Write a line in the file with the labeled elements of the 1339 // row. 
1340 line := fmt.Sprintf("s:%s x:%d\n", s, x) 1341 if _, err = file.WriteString(line); err != nil { 1342 return fmt.Errorf("error writing file %s: %v", file.Name(), err) 1343 } 1344 } 1345 if err = file.Close(); err != nil { 1346 return fmt.Errorf("error closing file: %v", err) 1347 } 1348 return scanner.Err() 1349 }, 1350 ) 1351 // Print the resulting slice. This forces evaluation of the slice. Notice 1352 // that this prints no output because slice is empty. Scanning consumes the 1353 // slice. 1354 fmt.Println("# slice") 1355 slicetest.Print(slice) 1356 1357 // slicetest.Print evaluates the slice, so we now make sure to clean up 1358 // after ourselves. 1359 for _, path := range shardPaths { 1360 defer os.Remove(path) 1361 } 1362 fmt.Println("# lines by shard") 1363 for shard, path := range shardPaths { 1364 fmt.Printf("## shard %d\n", shard) 1365 // Read and sort the lines for deterministic output. 1366 var lines []string 1367 file, err := os.Open(path) 1368 if err != nil { 1369 log.Fatalf("error opening %s for reading: %v", path, err) 1370 } 1371 scanner := bufio.NewScanner(file) 1372 for scanner.Scan() { 1373 lines = append(lines, scanner.Text()) 1374 } 1375 if scannerErr := scanner.Err(); scannerErr != nil { 1376 log.Fatalf("error scanning %s: %v", path, scannerErr) 1377 } 1378 sort.Strings(lines) 1379 for _, line := range lines { 1380 fmt.Println(line) 1381 } 1382 } 1383 // Output: 1384 // # slice 1385 // # lines by shard 1386 // ## shard 0 1387 // s:a x:3 1388 // s:b x:3 1389 // s:c x:2 1390 // ## shard 1 1391 // s:a x:2 1392 // s:b x:1 1393 // s:c x:1 1394 } 1395 1396 func ExampleWriterFunc() { 1397 // Use WriterFunc to write a file for each shard of the input. Each file 1398 // will contain a line for each row in the shard. 
1399 const numShards = 2 1400 slice := bigslice.Const(numShards, 1401 []string{"a", "b", "c", "a", "b", "c"}, 1402 []int{3, 3, 2, 2, 1, 1}, 1403 ) 1404 // For this simple example, use shared memory to store the paths to these 1405 // files so that we can easily aggregate the files for output. If we were 1406 // distributing this computation across machines without access to shared 1407 // memory, we'd need to use a different mechanism, e.g. write files to a 1408 // common backing store with a known prefix. 1409 shardPaths := make([]string, numShards) 1410 type writeState struct { 1411 file *os.File 1412 } 1413 slice = bigslice.WriterFunc(slice, 1414 func(shard int, state *writeState, readErr error, ss []string, xs []int) error { 1415 if state.file == nil { 1416 // First call; initialize state. 1417 var err error 1418 if state.file, err = ioutil.TempFile("", "example-writer-func"); err != nil { 1419 return fmt.Errorf("could not open temp file: %v", err) 1420 } 1421 shardPaths[shard] = state.file.Name() 1422 } 1423 for i := range ss { 1424 // We can safely assume that ss and xs are of equal length. 1425 s := ss[i] 1426 x := xs[i] 1427 // Write a line in the file with the labeled elements of the 1428 // row. 1429 line := fmt.Sprintf("s:%s x:%d\n", s, x) 1430 if _, err := state.file.WriteString(line); err != nil { 1431 return fmt.Errorf("error writing file: %v", err) 1432 } 1433 } 1434 if readErr != nil { 1435 // No more data is coming, so we close our file. 1436 if err := state.file.Close(); err != nil { 1437 return fmt.Errorf("error closing file: %v", err) 1438 } 1439 } 1440 return nil 1441 }, 1442 ) 1443 // Note that the slice passes through unadulterated. 1444 fmt.Println("# slice") 1445 slicetest.Print(slice) 1446 1447 // slicetest.Print evaluates the slice, so we now make sure to clean up 1448 // after ourselves. 
1449 for _, path := range shardPaths { 1450 defer os.Remove(path) 1451 } 1452 fmt.Println("# lines by shard") 1453 for shard, path := range shardPaths { 1454 fmt.Printf("## shard %d\n", shard) 1455 // Read and sort the lines for deterministic output. 1456 var lines []string 1457 file, err := os.Open(path) 1458 if err != nil { 1459 log.Fatalf("error opening %s for reading: %v", path, err) 1460 } 1461 scanner := bufio.NewScanner(file) 1462 for scanner.Scan() { 1463 lines = append(lines, scanner.Text()) 1464 } 1465 if scannerErr := scanner.Err(); scannerErr != nil { 1466 log.Fatalf("error scanning %s: %v", path, scannerErr) 1467 } 1468 sort.Strings(lines) 1469 for _, line := range lines { 1470 fmt.Println(line) 1471 } 1472 } 1473 // Output: 1474 // # slice 1475 // a 2 1476 // a 3 1477 // b 1 1478 // b 3 1479 // c 1 1480 // c 2 1481 // # lines by shard 1482 // ## shard 0 1483 // s:a x:3 1484 // s:b x:3 1485 // s:c x:2 1486 // ## shard 1 1487 // s:a x:2 1488 // s:b x:1 1489 // s:c x:1 1490 }