github.com/grailbio/base@v0.0.11/file/addfs/unzipfs/unzipfs.go (about)

     1  package unzipfs
     2  
     3  import (
     4  	"archive/zip"
     5  	"compress/flate"
     6  	"context"
     7  	stderrors "errors"
     8  	"fmt"
     9  	"io"
    10  	"io/fs"
    11  	"path"
    12  	"runtime"
    13  
    14  	"github.com/grailbio/base/errors"
    15  	"github.com/grailbio/base/file/addfs"
    16  	"github.com/grailbio/base/file/fsnode"
    17  	"github.com/grailbio/base/grail/biofs/biofseventlog"
    18  	"github.com/grailbio/base/ioctx"
    19  	"github.com/grailbio/base/ioctx/fsctx"
    20  	"github.com/grailbio/base/log"
    21  	"github.com/grailbio/base/morebufio"
    22  	"github.com/grailbio/base/sync/loadingcache"
    23  )
    24  
// unzipFunc is the stateless type behind Func; it carries no fields and exists only to hold
// the Apply method.
type unzipFunc struct{}

// Func is an addfs.PerNodeFunc that presents zip file contents as a subdirectory tree.
// Users can access contents in .../myfile.zip/unzip/, for example.
//
// The file need not have extension .zip. Func.Apply reads the file header and if it's not
// a supported zip file the unzip/ directory is omitted.
var Func unzipFunc

// Compile-time assertion that unzipFunc implements addfs.PerNodeFunc.
var _ addfs.PerNodeFunc = Func
    35  
    36  func (unzipFunc) Apply(ctx context.Context, node fsnode.T) ([]fsnode.T, error) {
    37  	zipLeaf, ok := node.(fsnode.Leaf)
    38  	if !ok {
    39  		return nil, nil
    40  	}
    41  	info := fsnode.NewDirInfo("unzip").WithCacheableFor(fsnode.CacheableFor(zipLeaf))
    42  	parent, err := parentFromLeaf(ctx, info, zipLeaf)
    43  	if err != nil {
    44  		return nil, err
    45  	}
    46  	if parent == nil {
    47  		return nil, nil
    48  	}
    49  	return []fsnode.T{parent}, nil
    50  }
    51  
// readerHandle bundles a parsed zip archive with the open file that backs it. One handle is
// shared by every directory and file node generated for the same archive.
type readerHandle struct {
	// Reader is the parsed archive; parentFromLeaf assigns it from zip.NewReader.
	*zip.Reader
	// Closer closes the open zip file that Reader reads from.
	ioctx.Closer
	// leaf is the zip file's own node. It is used to compute cacheability for generated
	// children and is re-opened for each entry opened via zipFileLeaf.OpenFile.
	leaf fsnode.Leaf
}
    57  
    58  func finalizeHandle(h *readerHandle) {
    59  	if err := h.Close(context.Background()); err != nil {
    60  		log.Error.Printf("unzipfs: error closing handle: %v", err)
    61  	}
    62  }
    63  
    64  // parentFromLeaf opens zipLeaf to determine if it's a zip file and returns a Parent if so.
    65  // Returns nil, nil in cases where the file is not supported (like, not a zip file).
    66  // TODO: Consider exposing more public APIs like this fsnode.Leaf -> fsnode.Parent and/or
    67  // *zip.Reader -> fsnode.Parent.
    68  func parentFromLeaf(ctx context.Context, parentInfo fsnode.FileInfo, zipLeaf fsnode.Leaf) (fsnode.Parent, error) {
    69  	zipFile, err := fsnode.Open(ctx, zipLeaf)
    70  	if err != nil {
    71  		return nil, errors.E(err, "opening for unzip")
    72  	}
    73  	handle := readerHandle{Closer: zipFile, leaf: zipLeaf}
    74  	// TODO: More reliable/explicit cleanup. Refcount?
    75  	runtime.SetFinalizer(&handle, finalizeHandle)
    76  	info, err := zipFile.Stat(ctx)
    77  	if err != nil {
    78  		return nil, errors.E(err, "stat-ing for unzip")
    79  	}
    80  	rAt, ok := zipFile.(ioctx.ReaderAt)
    81  	if !ok {
    82  		log.Info.Printf("zipfs: random access not supported: %s, returning empty dir", zipLeaf.Info().Name())
    83  		// TODO: Some less efficient fallback path? Try seeking?
    84  		return nil, nil
    85  	}
    86  	// Buffer makes header read much faster when underlying file is high latency, like S3.
    87  	// Of course there's a tradeoff where very small zip files (with much smaller headers) will
    88  	// not be read as lazily, but the speedup is significant for S3.
    89  	rAt = morebufio.NewReaderAtSize(rAt, 1024*1024)
    90  	handle.Reader, err = zip.NewReader(ioctx.ToStdReaderAt(ctx, rAt), info.Size())
    91  	if err != nil {
    92  		if stderrors.Is(err, zip.ErrFormat) ||
    93  			stderrors.Is(err, zip.ErrAlgorithm) ||
    94  			stderrors.Is(err, zip.ErrChecksum) {
    95  			log.Info.Printf("zipfs: not a valid zip file: %s, returning empty dir", zipLeaf.Info().Name())
    96  			return nil, nil
    97  		}
    98  		return nil, errors.E(err, "initializing zip reader")
    99  	}
   100  	return fsnode.NewParent(parentInfo, &handleChildGen{r: &handle, pathPrefix: "."}), nil
   101  }
   102  
// handleChildGen generates the child nodes of one directory inside a zip archive. The
// archive root uses pathPrefix "."; each subdirectory gets its own handleChildGen that
// shares the same readerHandle.
type handleChildGen struct {
	r          *readerHandle      // shared handle for the archive being listed
	pathPrefix string             // directory path within the archive, "." for the root
	children   loadingcache.Value // cached listing, populated by GenerateChildren
}
   108  
   109  func (g *handleChildGen) GenerateChildren(ctx context.Context) ([]fsnode.T, error) {
   110  	biofseventlog.UsedFeature("unzipfs.children")
   111  	var children []fsnode.T
   112  	err := g.children.GetOrLoad(ctx, &children, func(ctx context.Context, opts *loadingcache.LoadOpts) error {
   113  		entries, err := fs.ReadDir(g.r, g.pathPrefix)
   114  		if err != nil {
   115  			return err
   116  		}
   117  		children = make([]fsnode.T, len(entries))
   118  		cacheFor := fsnode.CacheableFor(g.r.leaf)
   119  		for i, entry := range entries {
   120  			stat, err := entry.Info() // Immediate (no additional file read) as of go1.17.
   121  			if err != nil {
   122  				return errors.E(err, fmt.Sprintf("stat: %s", entry.Name()))
   123  			}
   124  			childInfo := fsnode.CopyFileInfo(stat).WithCacheableFor(cacheFor)
   125  			fullName := path.Join(g.pathPrefix, entry.Name())
   126  			if entry.IsDir() {
   127  				children[i] = fsnode.NewParent(childInfo, &handleChildGen{r: g.r, pathPrefix: fullName})
   128  			} else {
   129  				children[i] = zipFileLeaf{g.r, childInfo, fullName}
   130  			}
   131  		}
   132  		opts.CacheFor(cacheFor)
   133  		return nil
   134  	})
   135  	if err != nil {
   136  		return nil, errors.E(err, fmt.Sprintf("listing path: %s", g.pathPrefix))
   137  	}
   138  	return children, nil
   139  }
   140  
// zipFileLeaf is the node for a single (non-directory) entry inside a zip archive.
type zipFileLeaf struct {
	// r is the shared handle for the archive containing this entry.
	r *readerHandle
	// FileInfo is the entry's metadata as presented to users of the node.
	fsnode.FileInfo
	// zipName is the entry's path within the archive; OpenFile matches it against
	// zip.File.Name.
	zipName string
}

// Compile-time assertion that *zipFileLeaf implements fsnode.Leaf.
var _ fsnode.Leaf = (*zipFileLeaf)(nil)

// FSNodeT is an intentionally empty method.
// NOTE(review): presumably a marker required by fsnode.T; confirm against the fsnode package.
func (z zipFileLeaf) FSNodeT() {}
   150  
// zipFileLeafFile is the open-file object returned by zipFileLeaf.OpenFile. It streams the
// contents of one zip entry, read sequentially from a section of the underlying zip file.
type zipFileLeafFile struct {
	// info is the entry's metadata, returned as-is by Stat.
	info fsnode.FileInfo

	// semaphore guards all subsequent fields. It's used to serialize operations.
	semaphore chan struct{}
	// stdRAt translates context-less ReadAt requests into context-ful ones. We serialize Read
	// requests (with semaphore) and then set stdRAt.Ctx temporarily to allow cancellation.
	stdRAt ioctx.StdReaderAt
	// stdRC wraps stdRAt. Its operations don't directly accept a context but are subject to
	// cancellation indirectly via the inner stdRAt.
	stdRC io.ReadCloser
	// fileCloser cleans up.
	fileCloser ioctx.Closer
}
   165  
   166  func (z zipFileLeaf) OpenFile(ctx context.Context, flag int) (fsctx.File, error) {
   167  	biofseventlog.UsedFeature("unzipfs.open")
   168  	var fileEntry *zip.File
   169  	for _, f := range z.r.File {
   170  		if f.Name == z.zipName {
   171  			fileEntry = f
   172  			break
   173  		}
   174  	}
   175  	if fileEntry == nil {
   176  		return nil, errors.E(errors.NotExist,
   177  			fmt.Sprintf("internal inconsistency: entry %q not found in zip metadata", z.zipName))
   178  	}
   179  	dataOffset, err := fileEntry.DataOffset()
   180  	if err != nil {
   181  		return nil, errors.E(err, fmt.Sprintf("could not get data offset for %s", fileEntry.Name))
   182  	}
   183  	var makeDecompressor func(r io.Reader) io.ReadCloser
   184  	switch fileEntry.Method {
   185  	case zip.Store:
   186  		// TODO: Consider returning a ReaderAt in this case for user convenience.
   187  		makeDecompressor = io.NopCloser
   188  	case zip.Deflate:
   189  		makeDecompressor = flate.NewReader
   190  	default:
   191  		return nil, errors.E(errors.NotSupported,
   192  			fmt.Sprintf("unsupported method: %d for: %s", fileEntry.Method, fileEntry.Name))
   193  	}
   194  	zipFile, err := fsnode.Open(ctx, z.r.leaf)
   195  	if err != nil {
   196  		return nil, err
   197  	}
   198  	rAt, ok := zipFile.(ioctx.ReaderAt)
   199  	if !ok {
   200  		err := errors.E(errors.NotSupported, fmt.Sprintf("not ReaderAt: %v", zipFile))
   201  		errors.CleanUpCtx(ctx, zipFile.Close, &err)
   202  		return nil, err
   203  	}
   204  	f := zipFileLeafFile{
   205  		info:       z.FileInfo,
   206  		semaphore:  make(chan struct{}, 1),
   207  		stdRAt:     ioctx.StdReaderAt{Ctx: ctx, ReaderAt: rAt},
   208  		fileCloser: zipFile,
   209  	}
   210  	defer func() { f.stdRAt.Ctx = nil }()
   211  	f.stdRC = makeDecompressor(
   212  		io.NewSectionReader(&f.stdRAt, dataOffset, int64(fileEntry.CompressedSize64)))
   213  	return &f, nil
   214  }
   215  
   216  func (f *zipFileLeafFile) Stat(context.Context) (fs.FileInfo, error) { return f.info, nil }
   217  
   218  func (f *zipFileLeafFile) Read(ctx context.Context, dst []byte) (int, error) {
   219  	select {
   220  	case f.semaphore <- struct{}{}:
   221  		defer func() { _ = <-f.semaphore }()
   222  	case <-ctx.Done():
   223  		return 0, ctx.Err()
   224  	}
   225  
   226  	f.stdRAt.Ctx = ctx
   227  	defer func() { f.stdRAt.Ctx = nil }()
   228  	return f.stdRC.Read(dst)
   229  }
   230  
   231  func (f *zipFileLeafFile) Close(ctx context.Context) error {
   232  	select {
   233  	case f.semaphore <- struct{}{}:
   234  		defer func() { _ = <-f.semaphore }()
   235  	case <-ctx.Done():
   236  		return ctx.Err()
   237  	}
   238  
   239  	f.stdRAt.Ctx = ctx
   240  	defer func() { f.stdRAt = ioctx.StdReaderAt{} }()
   241  	var err error
   242  	if f.stdRC != nil {
   243  		errors.CleanUp(f.stdRC.Close, &err)
   244  		f.stdRC = nil
   245  	}
   246  	if f.fileCloser != nil {
   247  		errors.CleanUpCtx(ctx, f.fileCloser.Close, &err)
   248  		f.fileCloser = nil
   249  	}
   250  	return err
   251  }