github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/sliceio/spiller.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package sliceio 6 7 import ( 8 "context" 9 "fmt" 10 "io" 11 "io/ioutil" 12 "math/rand" 13 "os" 14 "path/filepath" 15 16 "github.com/grailbio/base/backgroundcontext" 17 "github.com/grailbio/base/file" 18 "github.com/grailbio/bigslice/frame" 19 ) 20 21 // SpillBatchSize determines the amount of batching used in each 22 // spill file. A single read of a spill file produces this many rows. 23 // SpillBatchSize then trades off memory footprint for encoding size. 24 var SpillBatchSize = defaultChunksize 25 26 // A Spiller manages a set of spill files. 27 type Spiller string 28 29 // NewSpiller creates and returns a new spiller backed by a 30 // temporary directory. Spillers do not guarantee that the order 31 // of spillers returned matches the order of spills. 32 func NewSpiller(name string) (Spiller, error) { 33 dir, err := ioutil.TempDir("", fmt.Sprintf("spiller-%s-", name)) 34 if err != nil { 35 return "", err 36 } 37 return Spiller(dir), nil 38 } 39 40 // Spill spills the provided frame to a new file in the spiller. 41 // Spill returns the file's encoded size, or an error. The frame 42 // is encoded in batches of SpillBatchSize. 43 func (dir Spiller) Spill(frame frame.Frame) (int, error) { 44 // Generate a random path and divide it into a hierarchy 45 // of paths so that any particular directory does not get 46 // too big. We'll use 3 levels of hierarchy with a fanout 47 // of 255. 48 dirPath := string(dir) 49 for i := 0; i < 3; i++ { 50 dirPath = filepath.Join(dirPath, fmt.Sprintf("%02x", rand.Intn(255))) 51 } 52 _ = os.MkdirAll(dirPath, 0777) 53 f, err := ioutil.TempFile(dirPath, "spill-") 54 if err != nil { 55 return 0, err 56 } 57 // TODO(marius): buffer? 58 enc := NewEncodingWriter(f) 59 for frame.Len() > 0 { 60 n := SpillBatchSize 61 m := frame.Len() 62 if m < n { 63 n = m 64 } 65 if writeErr := enc.Write(context.Background(), frame.Slice(0, n)); writeErr != nil { 66 return 0, writeErr 67 } 68 frame = frame.Slice(n, m) 69 } 70 size, err := f.Seek(0, io.SeekCurrent) 71 if err != nil { 72 return 0, err 73 } 74 if err := f.Close(); err != nil { 75 return 0, err 76 } 77 return int(size), nil 78 } 79 80 // Readers returns a ReadCloser for each spiller file. 81 func (dir Spiller) Readers() ([]ReadCloser, error) { 82 var paths []string 83 // These are always on local paths, so background context is ok. 84 list := file.List(backgroundcontext.Get(), string(dir), true) 85 for list.Scan() { 86 if list.IsDir() { 87 continue 88 } 89 paths = append(paths, list.Path()) 90 } 91 if err := list.Err(); err != nil { 92 return nil, err 93 } 94 readers := make([]ReadCloser, len(paths)) 95 for i, path := range paths { 96 f, err := os.Open(path) 97 if err != nil { 98 for j := 0; j < i; j++ { 99 readers[j].Close() 100 } 101 return nil, err 102 } 103 readers[i] = ReaderWithCloseFunc{NewDecodingReader(f), f.Close} 104 } 105 return readers, nil 106 } 107 108 // ClosingReaders returns a reader for each spiller file. The readers close the 109 // underlying file when Read returns a non-nil error (otherwise the underlying 110 // file resource will leak). 111 func (dir Spiller) ClosingReaders() ([]Reader, error) { 112 readers, err := dir.Readers() 113 if err != nil { 114 return nil, err 115 } 116 cReaders := make([]Reader, len(readers)) 117 for i, r := range readers { 118 cReaders[i] = NewClosingReader(r) 119 } 120 return cReaders, nil 121 } 122 123 // Cleanup removes the spiller's temporary files. It is safe to call 124 // Cleanup after Readers(), but before reading is done. 125 func (dir Spiller) Cleanup() error { 126 return os.RemoveAll(string(dir)) 127 }