github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/sliceio/spiller.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package sliceio
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"io/ioutil"
    12  	"math/rand"
    13  	"os"
    14  	"path/filepath"
    15  
    16  	"github.com/grailbio/base/backgroundcontext"
    17  	"github.com/grailbio/base/file"
    18  	"github.com/grailbio/bigslice/frame"
    19  )
    20  
// SpillBatchSize determines the amount of batching used in each
// spill file: Spill encodes a frame in slices of at most this many
// rows, and a single read of a spill file produces this many rows.
// SpillBatchSize thus trades off memory footprint for encoding size.
var SpillBatchSize = defaultChunksize
    25  
// A Spiller manages a set of spill files. Its underlying string is
// the path of the directory under which the spill files are stored.
type Spiller string
    28  
    29  // NewSpiller creates and returns a new spiller backed by a
    30  // temporary directory. Spillers do not guarantee that the order
    31  // of spillers returned matches the order of spills.
    32  func NewSpiller(name string) (Spiller, error) {
    33  	dir, err := ioutil.TempDir("", fmt.Sprintf("spiller-%s-", name))
    34  	if err != nil {
    35  		return "", err
    36  	}
    37  	return Spiller(dir), nil
    38  }
    39  
    40  // Spill spills the provided frame to a new file in the spiller.
    41  // Spill returns the file's encoded size, or an error. The frame
    42  // is encoded in batches of SpillBatchSize.
    43  func (dir Spiller) Spill(frame frame.Frame) (int, error) {
    44  	// Generate a random path and divide it into a hierarchy
    45  	// of paths so that any particular directory does not get
    46  	// too big. We'll use 3 levels of hierarchy with a fanout
    47  	// of 255.
    48  	dirPath := string(dir)
    49  	for i := 0; i < 3; i++ {
    50  		dirPath = filepath.Join(dirPath, fmt.Sprintf("%02x", rand.Intn(255)))
    51  	}
    52  	_ = os.MkdirAll(dirPath, 0777)
    53  	f, err := ioutil.TempFile(dirPath, "spill-")
    54  	if err != nil {
    55  		return 0, err
    56  	}
    57  	// TODO(marius): buffer?
    58  	enc := NewEncodingWriter(f)
    59  	for frame.Len() > 0 {
    60  		n := SpillBatchSize
    61  		m := frame.Len()
    62  		if m < n {
    63  			n = m
    64  		}
    65  		if writeErr := enc.Write(context.Background(), frame.Slice(0, n)); writeErr != nil {
    66  			return 0, writeErr
    67  		}
    68  		frame = frame.Slice(n, m)
    69  	}
    70  	size, err := f.Seek(0, io.SeekCurrent)
    71  	if err != nil {
    72  		return 0, err
    73  	}
    74  	if err := f.Close(); err != nil {
    75  		return 0, err
    76  	}
    77  	return int(size), nil
    78  }
    79  
    80  // Readers returns a ReadCloser for each spiller file.
    81  func (dir Spiller) Readers() ([]ReadCloser, error) {
    82  	var paths []string
    83  	// These are always on local paths, so background context is ok.
    84  	list := file.List(backgroundcontext.Get(), string(dir), true)
    85  	for list.Scan() {
    86  		if list.IsDir() {
    87  			continue
    88  		}
    89  		paths = append(paths, list.Path())
    90  	}
    91  	if err := list.Err(); err != nil {
    92  		return nil, err
    93  	}
    94  	readers := make([]ReadCloser, len(paths))
    95  	for i, path := range paths {
    96  		f, err := os.Open(path)
    97  		if err != nil {
    98  			for j := 0; j < i; j++ {
    99  				readers[j].Close()
   100  			}
   101  			return nil, err
   102  		}
   103  		readers[i] = ReaderWithCloseFunc{NewDecodingReader(f), f.Close}
   104  	}
   105  	return readers, nil
   106  }
   107  
   108  // ClosingReaders returns a reader for each spiller file. The readers close the
   109  // underlying file when Read returns a non-nil error (otherwise the underlying
   110  // file resource will leak).
   111  func (dir Spiller) ClosingReaders() ([]Reader, error) {
   112  	readers, err := dir.Readers()
   113  	if err != nil {
   114  		return nil, err
   115  	}
   116  	cReaders := make([]Reader, len(readers))
   117  	for i, r := range readers {
   118  		cReaders[i] = NewClosingReader(r)
   119  	}
   120  	return cReaders, nil
   121  }
   122  
   123  // Cleanup removes the spiller's temporary files. It is safe to call
   124  // Cleanup after Readers(), but before reading is done.
   125  func (dir Spiller) Cleanup() error {
   126  	return os.RemoveAll(string(dir))
   127  }