github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/cache.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package bigslice
     6  
     7  import (
     8  	"context"
     9  
    10  	"github.com/grailbio/bigslice/internal/slicecache"
    11  	"github.com/grailbio/bigslice/slicefunc"
    12  	"github.com/grailbio/bigslice/sliceio"
    13  	"github.com/grailbio/bigslice/slicetype"
    14  )
    15  
// cacheSlice pairs a Slice with a file-backed shard cache. The name
// field holds the value reported by Name, the embedded Slice is the
// slice being wrapped (and is what Dep hands back as the single
// dependency), and cache is the shard cache exposed through the Cache
// method.
type cacheSlice struct {
	name Name
	Slice
	cache *slicecache.FileShardCache
}
    21  
// Compile-time assertion that *cacheSlice satisfies slicecache.Cacheable.
var _ slicecache.Cacheable = (*cacheSlice)(nil)
    23  
    24  func (c *cacheSlice) Name() Name                                             { return c.name }
    25  func (c *cacheSlice) NumDep() int                                            { return 1 }
    26  func (c *cacheSlice) Dep(i int) Dep                                          { return Dep{c.Slice, false, nil, false} }
    27  func (*cacheSlice) Combiner() slicefunc.Func                                 { return slicefunc.Nil }
    28  func (c *cacheSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { return deps[0] }
    29  
    30  func (c *cacheSlice) Cache() slicecache.ShardCache { return c.cache }
    31  
    32  // Cache caches the output of a slice to the given file prefix.
    33  // Cached data are stored as "prefix-nnnn-of-mmmm" for shards nnnn of
    34  // mmmm. When the slice is computed, each shard is encoded and
    35  // written to a separate file with this prefix. If all shards exist,
    36  // then Cache shortcuts computation and instead reads directly from
    37  // the previously computed output. The user must guarantee cache
    38  // consistency: if the cache could be invalid (e.g., because of
    39  // code changes), the user is responsible for removing existing
    40  // cached files, or picking a different prefix that correctly
    41  // represents the operation to be cached.
    42  //
    43  // Cache uses GRAIL's file library, so prefix may refer to URLs to a
    44  // distributed object store such as S3.
    45  func Cache(ctx context.Context, slice Slice, prefix string) Slice {
    46  	shardCache := slicecache.NewFileShardCache(ctx, prefix, slice.NumShard())
    47  	shardCache.RequireAllCached()
    48  	return &cacheSlice{MakeName("cache"), slice, shardCache}
    49  }
    50  
    51  // CachePartial caches the output of the slice to the given file
    52  // prefix (it uses the same file naming scheme as Cache). However, unlike
    53  // Cache, if CachePartial finds incomplete cached results (from an
    54  // earlier failed or interrupted run), it will use them and recompute only
    55  // the missing data.
    56  //
    57  // WARNING: The user is responsible for ensuring slice's contents are
    58  // deterministic between bigslice runs. If keys are non-deterministic, for
    59  // example due to pseudorandom seeding based on time, or reading the state
    60  // of a modifiable file in S3, CachePartial produces corrupt results.
    61  //
    62  // As with Cache, the user must guarantee cache consistency.
    63  func CachePartial(ctx context.Context, slice Slice, prefix string) Slice {
    64  	shardCache := slicecache.NewFileShardCache(ctx, prefix, slice.NumShard())
    65  	return &cacheSlice{MakeName("cachepartial"), slice, shardCache}
    66  }
    67  
// readCacheSlice is a slice with no dependencies whose shard readers
// are obtained from a file shard cache. It embeds the slicetype.Type it
// was constructed with; name and numShard hold the values reported by
// Name and NumShard, and cache supplies the readers returned by Reader.
type readCacheSlice struct {
	slicetype.Type
	name     Name
	numShard int
	cache    *slicecache.FileShardCache
}
    74  
    75  func (r *readCacheSlice) Name() Name             { return r.name }
    76  func (r *readCacheSlice) NumShard() int          { return r.numShard }
    77  func (*readCacheSlice) ShardType() ShardType     { return HashShard }
    78  func (*readCacheSlice) NumDep() int              { return 0 }
    79  func (*readCacheSlice) Dep(i int) Dep            { panic("no deps") }
    80  func (*readCacheSlice) Combiner() slicefunc.Func { return slicefunc.Nil }
    81  
    82  func (r *readCacheSlice) Reader(shard int, _ []sliceio.Reader) sliceio.Reader {
    83  	return r.cache.CacheReader(shard)
    84  }
    85  
    86  // ReadCache reads from an existing cache but does not write any cache itself.
    87  // This may be useful if you want to reuse a cache from a previous computation
    88  // and fail if it does not exist. typ is the type of the cached and returned
    89  // slice. You may construct typ using slicetype.New or pass a Slice, which
    90  // embeds slicetype.Type.
    91  func ReadCache(ctx context.Context, typ slicetype.Type, numShard int, prefix string) Slice {
    92  	shardCache := slicecache.NewFileShardCache(ctx, prefix, numShard)
    93  	shardCache.RequireAllCached()
    94  	return &readCacheSlice{typ, MakeName("readcache"), numShard, shardCache}
    95  }