github.com/grailbio/base@v0.0.11/file/addfs/unzipfs/unzipfs.go (about) 1 package unzipfs 2 3 import ( 4 "archive/zip" 5 "compress/flate" 6 "context" 7 stderrors "errors" 8 "fmt" 9 "io" 10 "io/fs" 11 "path" 12 "runtime" 13 14 "github.com/grailbio/base/errors" 15 "github.com/grailbio/base/file/addfs" 16 "github.com/grailbio/base/file/fsnode" 17 "github.com/grailbio/base/grail/biofs/biofseventlog" 18 "github.com/grailbio/base/ioctx" 19 "github.com/grailbio/base/ioctx/fsctx" 20 "github.com/grailbio/base/log" 21 "github.com/grailbio/base/morebufio" 22 "github.com/grailbio/base/sync/loadingcache" 23 ) 24 25 type unzipFunc struct{} 26 27 // Func is an addfs.PerNodeFunc that presents zip file contents as a subdirectory tree. 28 // Users can access contents in .../myfile.zip/unzip/, for example. 29 // 30 // The file need not have extension .zip. Func.Apply reads the file header and if it's not 31 // a supported zip file the unzip/ directory is omitted. 32 var Func unzipFunc 33 34 var _ addfs.PerNodeFunc = Func 35 36 func (unzipFunc) Apply(ctx context.Context, node fsnode.T) ([]fsnode.T, error) { 37 zipLeaf, ok := node.(fsnode.Leaf) 38 if !ok { 39 return nil, nil 40 } 41 info := fsnode.NewDirInfo("unzip").WithCacheableFor(fsnode.CacheableFor(zipLeaf)) 42 parent, err := parentFromLeaf(ctx, info, zipLeaf) 43 if err != nil { 44 return nil, err 45 } 46 if parent == nil { 47 return nil, nil 48 } 49 return []fsnode.T{parent}, nil 50 } 51 52 type readerHandle struct { 53 *zip.Reader 54 ioctx.Closer 55 leaf fsnode.Leaf 56 } 57 58 func finalizeHandle(h *readerHandle) { 59 if err := h.Close(context.Background()); err != nil { 60 log.Error.Printf("unzipfs: error closing handle: %v", err) 61 } 62 } 63 64 // parentFromLeaf opens zipLeaf to determine if it's a zip file and returns a Parent if so. 65 // Returns nil, nil in cases where the file is not supported (like, not a zip file). 66 // TODO: Consider exposing more public APIs like this fsnode.Leaf -> fsnode.Parent and/or 67 // *zip.Reader -> fsnode.Parent. 68 func parentFromLeaf(ctx context.Context, parentInfo fsnode.FileInfo, zipLeaf fsnode.Leaf) (fsnode.Parent, error) { 69 zipFile, err := fsnode.Open(ctx, zipLeaf) 70 if err != nil { 71 return nil, errors.E(err, "opening for unzip") 72 } 73 handle := readerHandle{Closer: zipFile, leaf: zipLeaf} 74 // TODO: More reliable/explicit cleanup. Refcount? 75 runtime.SetFinalizer(&handle, finalizeHandle) 76 info, err := zipFile.Stat(ctx) 77 if err != nil { 78 return nil, errors.E(err, "stat-ing for unzip") 79 } 80 rAt, ok := zipFile.(ioctx.ReaderAt) 81 if !ok { 82 log.Info.Printf("zipfs: random access not supported: %s, returning empty dir", zipLeaf.Info().Name()) 83 // TODO: Some less efficient fallback path? Try seeking? 84 return nil, nil 85 } 86 // Buffer makes header read much faster when underlying file is high latency, like S3. 87 // Of course there's a tradeoff where very small zip files (with much smaller headers) will 88 // not be read as lazily, but the speedup is significant for S3. 89 rAt = morebufio.NewReaderAtSize(rAt, 1024*1024) 90 handle.Reader, err = zip.NewReader(ioctx.ToStdReaderAt(ctx, rAt), info.Size()) 91 if err != nil { 92 if stderrors.Is(err, zip.ErrFormat) || 93 stderrors.Is(err, zip.ErrAlgorithm) || 94 stderrors.Is(err, zip.ErrChecksum) { 95 log.Info.Printf("zipfs: not a valid zip file: %s, returning empty dir", zipLeaf.Info().Name()) 96 return nil, nil 97 } 98 return nil, errors.E(err, "initializing zip reader") 99 } 100 return fsnode.NewParent(parentInfo, &handleChildGen{r: &handle, pathPrefix: "."}), nil 101 } 102 103 type handleChildGen struct { 104 r *readerHandle 105 pathPrefix string 106 children loadingcache.Value 107 } 108 109 func (g *handleChildGen) GenerateChildren(ctx context.Context) ([]fsnode.T, error) { 110 biofseventlog.UsedFeature("unzipfs.children") 111 var children []fsnode.T 112 err := g.children.GetOrLoad(ctx, &children, func(ctx context.Context, opts *loadingcache.LoadOpts) error { 113 entries, err := fs.ReadDir(g.r, g.pathPrefix) 114 if err != nil { 115 return err 116 } 117 children = make([]fsnode.T, len(entries)) 118 cacheFor := fsnode.CacheableFor(g.r.leaf) 119 for i, entry := range entries { 120 stat, err := entry.Info() // Immediate (no additional file read) as of go1.17. 121 if err != nil { 122 return errors.E(err, fmt.Sprintf("stat: %s", entry.Name())) 123 } 124 childInfo := fsnode.CopyFileInfo(stat).WithCacheableFor(cacheFor) 125 fullName := path.Join(g.pathPrefix, entry.Name()) 126 if entry.IsDir() { 127 children[i] = fsnode.NewParent(childInfo, &handleChildGen{r: g.r, pathPrefix: fullName}) 128 } else { 129 children[i] = zipFileLeaf{g.r, childInfo, fullName} 130 } 131 } 132 opts.CacheFor(cacheFor) 133 return nil 134 }) 135 if err != nil { 136 return nil, errors.E(err, fmt.Sprintf("listing path: %s", g.pathPrefix)) 137 } 138 return children, nil 139 } 140 141 type zipFileLeaf struct { 142 r *readerHandle 143 fsnode.FileInfo 144 zipName string 145 } 146 147 var _ fsnode.Leaf = (*zipFileLeaf)(nil) 148 149 func (z zipFileLeaf) FSNodeT() {} 150 151 type zipFileLeafFile struct { 152 info fsnode.FileInfo 153 154 // semaphore guards all subsequent fields. It's used to serialize operations. 155 semaphore chan struct{} 156 // stdRAt translates context-less ReadAt requests into context-ful ones. We serialize Read 157 // requests (with semaphore) and then set stdRAt.Ctx temporarily to allow cancellation. 158 stdRAt ioctx.StdReaderAt 159 // stdRC wraps stdRAt. Its operations don't directly accept a context but are subject to 160 // cancellation indirectly via the inner stdRAt. 161 stdRC io.ReadCloser 162 // fileCloser cleans up. 163 fileCloser ioctx.Closer 164 } 165 166 func (z zipFileLeaf) OpenFile(ctx context.Context, flag int) (fsctx.File, error) { 167 biofseventlog.UsedFeature("unzipfs.open") 168 var fileEntry *zip.File 169 for _, f := range z.r.File { 170 if f.Name == z.zipName { 171 fileEntry = f 172 break 173 } 174 } 175 if fileEntry == nil { 176 return nil, errors.E(errors.NotExist, 177 fmt.Sprintf("internal inconsistency: entry %q not found in zip metadata", z.zipName)) 178 } 179 dataOffset, err := fileEntry.DataOffset() 180 if err != nil { 181 return nil, errors.E(err, fmt.Sprintf("could not get data offset for %s", fileEntry.Name)) 182 } 183 var makeDecompressor func(r io.Reader) io.ReadCloser 184 switch fileEntry.Method { 185 case zip.Store: 186 // TODO: Consider returning a ReaderAt in this case for user convenience. 187 makeDecompressor = io.NopCloser 188 case zip.Deflate: 189 makeDecompressor = flate.NewReader 190 default: 191 return nil, errors.E(errors.NotSupported, 192 fmt.Sprintf("unsupported method: %d for: %s", fileEntry.Method, fileEntry.Name)) 193 } 194 zipFile, err := fsnode.Open(ctx, z.r.leaf) 195 if err != nil { 196 return nil, err 197 } 198 rAt, ok := zipFile.(ioctx.ReaderAt) 199 if !ok { 200 err := errors.E(errors.NotSupported, fmt.Sprintf("not ReaderAt: %v", zipFile)) 201 errors.CleanUpCtx(ctx, zipFile.Close, &err) 202 return nil, err 203 } 204 f := zipFileLeafFile{ 205 info: z.FileInfo, 206 semaphore: make(chan struct{}, 1), 207 stdRAt: ioctx.StdReaderAt{Ctx: ctx, ReaderAt: rAt}, 208 fileCloser: zipFile, 209 } 210 defer func() { f.stdRAt.Ctx = nil }() 211 f.stdRC = makeDecompressor( 212 io.NewSectionReader(&f.stdRAt, dataOffset, int64(fileEntry.CompressedSize64))) 213 return &f, nil 214 } 215 216 func (f *zipFileLeafFile) Stat(context.Context) (fs.FileInfo, error) { return f.info, nil } 217 218 func (f *zipFileLeafFile) Read(ctx context.Context, dst []byte) (int, error) { 219 select { 220 case f.semaphore <- struct{}{}: 221 defer func() { _ = <-f.semaphore }() 222 case <-ctx.Done(): 223 return 0, ctx.Err() 224 } 225 226 f.stdRAt.Ctx = ctx 227 defer func() { f.stdRAt.Ctx = nil }() 228 return f.stdRC.Read(dst) 229 } 230 231 func (f *zipFileLeafFile) Close(ctx context.Context) error { 232 select { 233 case f.semaphore <- struct{}{}: 234 defer func() { _ = <-f.semaphore }() 235 case <-ctx.Done(): 236 return ctx.Err() 237 } 238 239 f.stdRAt.Ctx = ctx 240 defer func() { f.stdRAt = ioctx.StdReaderAt{} }() 241 var err error 242 if f.stdRC != nil { 243 errors.CleanUp(f.stdRC.Close, &err) 244 f.stdRC = nil 245 } 246 if f.fileCloser != nil { 247 errors.CleanUpCtx(ctx, f.fileCloser.Close, &err) 248 f.fileCloser = nil 249 } 250 return err 251 }