github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/cache.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package bigslice 6 7 import ( 8 "context" 9 10 "github.com/grailbio/bigslice/internal/slicecache" 11 "github.com/grailbio/bigslice/slicefunc" 12 "github.com/grailbio/bigslice/sliceio" 13 "github.com/grailbio/bigslice/slicetype" 14 ) 15 16 type cacheSlice struct { 17 name Name 18 Slice 19 cache *slicecache.FileShardCache 20 } 21 22 var _ slicecache.Cacheable = (*cacheSlice)(nil) 23 24 func (c *cacheSlice) Name() Name { return c.name } 25 func (c *cacheSlice) NumDep() int { return 1 } 26 func (c *cacheSlice) Dep(i int) Dep { return Dep{c.Slice, false, nil, false} } 27 func (*cacheSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 28 func (c *cacheSlice) Reader(shard int, deps []sliceio.Reader) sliceio.Reader { return deps[0] } 29 30 func (c *cacheSlice) Cache() slicecache.ShardCache { return c.cache } 31 32 // Cache caches the output of a slice to the given file prefix. 33 // Cached data are stored as "prefix-nnnn-of-mmmm" for shards nnnn of 34 // mmmm. When the slice is computed, each shard is encoded and 35 // written to a separate file with this prefix. If all shards exist, 36 // then Cache shortcuts computation and instead reads directly from 37 // the previously computed output. The user must guarantee cache 38 // consistency: if the cache could be invalid (e.g., because of 39 // code changes), the user is responsible for removing existing 40 // cached files, or picking a different prefix that correctly 41 // represents the operation to be cached. 42 // 43 // Cache uses GRAIL's file library, so prefix may refer to URLs to a 44 // distributed object store such as S3. 45 func Cache(ctx context.Context, slice Slice, prefix string) Slice { 46 shardCache := slicecache.NewFileShardCache(ctx, prefix, slice.NumShard()) 47 shardCache.RequireAllCached() 48 return &cacheSlice{MakeName("cache"), slice, shardCache} 49 } 50 51 // CachePartial caches the output of the slice to the given file 52 // prefix (it uses the same file naming scheme as Cache). However, unlike 53 // Cache, if CachePartial finds incomplete cached results (from an 54 // earlier failed or interrupted run), it will use them and recompute only 55 // the missing data. 56 // 57 // WARNING: The user is responsible for ensuring slice's contents are 58 // deterministic between bigslice runs. If keys are non-deterministic, for 59 // example due to pseudorandom seeding based on time, or reading the state 60 // of a modifiable file in S3, CachePartial produces corrupt results. 61 // 62 // As with Cache, the user must guarantee cache consistency. 63 func CachePartial(ctx context.Context, slice Slice, prefix string) Slice { 64 shardCache := slicecache.NewFileShardCache(ctx, prefix, slice.NumShard()) 65 return &cacheSlice{MakeName("cachepartial"), slice, shardCache} 66 } 67 68 type readCacheSlice struct { 69 slicetype.Type 70 name Name 71 numShard int 72 cache *slicecache.FileShardCache 73 } 74 75 func (r *readCacheSlice) Name() Name { return r.name } 76 func (r *readCacheSlice) NumShard() int { return r.numShard } 77 func (*readCacheSlice) ShardType() ShardType { return HashShard } 78 func (*readCacheSlice) NumDep() int { return 0 } 79 func (*readCacheSlice) Dep(i int) Dep { panic("no deps") } 80 func (*readCacheSlice) Combiner() slicefunc.Func { return slicefunc.Nil } 81 82 func (r *readCacheSlice) Reader(shard int, _ []sliceio.Reader) sliceio.Reader { 83 return r.cache.CacheReader(shard) 84 } 85 86 // ReadCache reads from an existing cache but does not write any cache itself. 87 // This may be useful if you want to reuse a cache from a previous computation 88 // and fail if it does not exist. typ is the type of the cached and returned 89 // slice. You may construct typ using slicetype.New or pass a Slice, which 90 // embeds slicetype.Type. 91 func ReadCache(ctx context.Context, typ slicetype.Type, numShard int, prefix string) Slice { 92 shardCache := slicecache.NewFileShardCache(ctx, prefix, numShard) 93 shardCache.RequireAllCached() 94 return &readCacheSlice{typ, MakeName("readcache"), numShard, shardCache} 95 }