github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/archive/tarslice/tarslice.go (about)

     1  // Copyright 2019 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package tarslice implements bigslice operations for reading tar archives.
     6  package tarslice
     7  
     8  import (
     9  	"archive/tar"
    10  	"io"
    11  	"io/ioutil"
    12  
    13  	"github.com/grailbio/bigslice"
    14  	"github.com/grailbio/bigslice/sliceio"
    15  )
    16  
    17  // Entry describes a single tar file entry, including its full contents.
    18  type Entry struct {
    19  	// Header is the full tar header.
    20  	tar.Header
    21  	// Body is the file contents.
    22  	Body []byte
    23  }
    24  
    25  // Reader returns a slice of Entry records representing the tar
    26  // archive of the io.ReadCloser returned by the archive func. Slices
    27  // are sharded nshard ways, striped across entries. Note that the
    28  // archive is read fully for each shard produced.
    29  func Reader(nshard int, archive func() (io.ReadCloser, error)) bigslice.Slice {
    30  	bigslice.Helper()
    31  	type state struct {
    32  		*tar.Reader
    33  		io.Closer
    34  	}
    35  	return bigslice.ReaderFunc(nshard, func(shard int, state *state, entries []Entry) (n int, err error) {
    36  		first := state.Reader == nil
    37  		defer func() {
    38  			if err != nil && state.Closer != nil {
    39  				state.Close()
    40  			}
    41  		}()
    42  		if first {
    43  			rc, err := archive()
    44  			if err != nil {
    45  				return 0, err
    46  			}
    47  			state.Reader = tar.NewReader(rc)
    48  			state.Closer = rc
    49  			if err := skip(state.Reader, shard); err != nil {
    50  				return 0, err
    51  			}
    52  		}
    53  		for i := range entries {
    54  			if !first || i > 0 {
    55  				if err := skip(state.Reader, nshard-1); err != nil {
    56  					return i, err
    57  				}
    58  			}
    59  			head, err := state.Next()
    60  			if err != nil {
    61  				if err == io.EOF {
    62  					err = sliceio.EOF
    63  				}
    64  				return i, err
    65  			}
    66  			entries[i].Header = *head
    67  			entries[i].Body, err = ioutil.ReadAll(state)
    68  			if err != nil {
    69  				return i, err
    70  			}
    71  		}
    72  		return len(entries), nil
    73  	})
    74  }
    75  
    76  func skip(r *tar.Reader, n int) error {
    77  	for i := 0; i < n; i++ {
    78  		_, err := r.Next()
    79  		if err == io.EOF {
    80  			return sliceio.EOF
    81  		}
    82  	}
    83  	return nil
    84  }