github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/archive/tarslice/tarslice.go (about) 1 // Copyright 2019 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 // Package tarslice implements bigslice operations for reading tar archives. 6 package tarslice 7 8 import ( 9 "archive/tar" 10 "io" 11 "io/ioutil" 12 13 "github.com/grailbio/bigslice" 14 "github.com/grailbio/bigslice/sliceio" 15 ) 16 17 // Entry describes a single tar file entry, including its full contents. 18 type Entry struct { 19 // Header is the full tar header. 20 tar.Header 21 // Body is the file contents. 22 Body []byte 23 } 24 25 // Reader returns a slice of Entry records representing the tar 26 // archive of the io.ReadCloser returned by the archive func. Slices 27 // are sharded nshard ways, striped across entries. Note that the 28 // archive is read fully for each shard produced. 29 func Reader(nshard int, archive func() (io.ReadCloser, error)) bigslice.Slice { 30 bigslice.Helper() 31 type state struct { 32 *tar.Reader 33 io.Closer 34 } 35 return bigslice.ReaderFunc(nshard, func(shard int, state *state, entries []Entry) (n int, err error) { 36 first := state.Reader == nil 37 defer func() { 38 if err != nil && state.Closer != nil { 39 state.Close() 40 } 41 }() 42 if first { 43 rc, err := archive() 44 if err != nil { 45 return 0, err 46 } 47 state.Reader = tar.NewReader(rc) 48 state.Closer = rc 49 if err := skip(state.Reader, shard); err != nil { 50 return 0, err 51 } 52 } 53 for i := range entries { 54 if !first || i > 0 { 55 if err := skip(state.Reader, nshard-1); err != nil { 56 return i, err 57 } 58 } 59 head, err := state.Next() 60 if err != nil { 61 if err == io.EOF { 62 err = sliceio.EOF 63 } 64 return i, err 65 } 66 entries[i].Header = *head 67 entries[i].Body, err = ioutil.ReadAll(state) 68 if err != nil { 69 return i, err 70 } 71 } 72 return len(entries), nil 73 }) 74 } 75 76 func skip(r *tar.Reader, n int) error { 77 for i := 0; i < n; i++ { 78 _, err := r.Next() 79 if err == io.EOF { 80 return sliceio.EOF 81 } 82 } 83 return nil 84 }