// Package chunker provides a way to chunk an input into uploadable-size byte slices.
package chunker

import (
	"fmt"
	"io"

	"github.com/klauspost/compress/zstd"
	"github.com/pkg/errors"

	"github.com/bazelbuild/remote-apis-sdks/go/pkg/reader"
	"github.com/bazelbuild/remote-apis-sdks/go/pkg/uploadinfo"
)

// DefaultChunkSize is the default chunk size for ByteStream.Write RPCs.
const DefaultChunkSize = 1024 * 1024

// IOBufferSize regulates how many bytes at a time the Chunker will read from a file source.
var IOBufferSize = 10 * 1024 * 1024

// ErrEOF is returned when Next is called when HasNext is false.
var ErrEOF = errors.New("ErrEOF")

// fullCompressor compresses entire blobs in one EncodeAll call.
// It is *only* thread-safe for EncodeAll calls and should not be used for streamed compression.
// While we avoid sending 0 len blobs, we do want to create zero len compressed blobs if
// necessary, hence zstd.WithZeroFrames(true).
var fullCompressor, _ = zstd.NewWriter(nil, zstd.WithZeroFrames(true))

// Chunker can be used to chunk an input into uploadable-size byte slices.
// A single Chunker is NOT thread-safe; it should be used by a single uploader thread.
type Chunker struct {
	// chunkSize is the maximum number of bytes returned by a single Next call.
	chunkSize int
	// r is the file-backed data source; nil when contents was populated up front.
	r reader.ReadSeeker
	// An optional cache of the full data. It will be present in these cases:
	// * The Chunker was initialized from a []byte.
	// * Chunker.FullData was called at least once.
	// * Next() was called and the read was less than IOBufferSize.
	// Once contents are initialized, they are immutable.
	contents []byte
	// offset is the number of bytes already handed out via Next.
	offset int64
	// reachedEOF is set once all data has been returned; HasNext reports its negation.
	reachedEOF bool

	// ue is the upload entry this Chunker was constructed from.
	ue *uploadinfo.Entry
}

// New creates a new chunker from an uploadinfo.Entry.
// If compressed, the data of the Entry will be compressed on the fly.
49 func New(ue *uploadinfo.Entry, compressed bool, chunkSize int) (*Chunker, error) { 50 if chunkSize < 1 { 51 chunkSize = DefaultChunkSize 52 } 53 var c *Chunker 54 if ue.IsBlob() { 55 contents := make([]byte, len(ue.Contents)) 56 copy(contents, ue.Contents) 57 if compressed { 58 contents = fullCompressor.EncodeAll(contents, nil) 59 } 60 c = &Chunker{ 61 contents: contents, 62 } 63 } else if ue.IsFile() { 64 r := reader.NewFileReadSeeker(ue.Path, IOBufferSize) 65 if compressed { 66 var err error 67 r, err = reader.NewCompressedSeeker(r) 68 if err != nil { 69 return nil, err 70 } 71 } 72 c = &Chunker{ 73 r: r, 74 } 75 76 if chunkSize > IOBufferSize { 77 chunkSize = IOBufferSize 78 } 79 } else { 80 return nil, errors.New("invalid Entry") 81 } 82 83 c.chunkSize = chunkSize 84 c.ue = ue 85 return c, nil 86 } 87 88 // String returns an identifiable representation of the Chunker. 89 func (c *Chunker) String() string { 90 size := fmt.Sprintf("<%d bytes>", c.ue.Digest.Size) 91 if !c.ue.IsFile() { 92 return size 93 } 94 return fmt.Sprintf("%s: %s", size, c.ue.Path) 95 } 96 97 // Offset returns the current Chunker offset. 98 func (c *Chunker) Offset() int64 { 99 return c.offset 100 } 101 102 // ChunkSize returns the maximum size of each chunk. 103 func (c *Chunker) ChunkSize() int { 104 return c.chunkSize 105 } 106 107 // Reset the Chunker state to when it was newly constructed. 108 // Useful for upload retries. 109 // TODO(olaola): implement Seek(offset) when we have resumable uploads. 110 func (c *Chunker) Reset() error { 111 if c.r != nil { 112 if err := c.r.SeekOffset(0); err != nil { 113 return errors.Wrapf(err, "failed to call SeekOffset(0) for %s", c.ue.Path) 114 } 115 } 116 c.offset = 0 117 c.reachedEOF = false 118 return nil 119 } 120 121 // FullData returns the overall (non-chunked) underlying data. The Chunker is Reset. 122 // It is supposed to be used for batch uploading small inputs. 
123 func (c *Chunker) FullData() ([]byte, error) { 124 if err := c.Reset(); err != nil { 125 return nil, err 126 } 127 if c.contents != nil { 128 return c.contents, nil 129 } 130 var err error 131 if !c.r.IsInitialized() { 132 err = c.r.Initialize() 133 } 134 if err != nil { 135 c.r.Close() // Free file handle in case of error. 136 return nil, err 137 } 138 // Cache contents so that the next call to FullData() doesn't result in file read. 139 c.contents, err = io.ReadAll(c.r) 140 c.r.Close() 141 return c.contents, err 142 } 143 144 // HasNext returns whether a subsequent call to Next will return a valid chunk. Always true for a 145 // newly created Chunker. 146 func (c *Chunker) HasNext() bool { 147 return !c.reachedEOF 148 } 149 150 // Chunk is a piece of a byte[] blob suitable for being uploaded. 151 type Chunk struct { 152 Offset int64 153 Data []byte 154 } 155 156 // Next returns the next chunk of data or error. ErrEOF is returned if and only if HasNext is false. 157 // Chunk.Data will be empty if and only if the full underlying data is empty (in which case it will 158 // be the only chunk returned). Chunk.Digest will only be filled for the first chunk. 159 func (c *Chunker) Next() (*Chunk, error) { 160 if !c.HasNext() { 161 return nil, ErrEOF 162 } 163 if c.ue.Digest.Size == 0 { 164 c.reachedEOF = true 165 return &Chunk{}, nil 166 } 167 168 var data []byte 169 if c.contents != nil { 170 // As long as we have data in memory, it's much more efficient to return 171 // a view slice than to copy it around. Contents are immutable so it's okay 172 // to return the slice. 
173 endRead := int(c.offset) + c.chunkSize 174 if endRead >= len(c.contents) { 175 endRead = len(c.contents) 176 c.reachedEOF = true 177 } 178 data = c.contents[c.offset:endRead] 179 } else { 180 if !c.r.IsInitialized() { 181 err := c.r.Initialize() 182 if err != nil { 183 return nil, err 184 } 185 } 186 187 // We don't need to check the amount of bytes read, as ReadFull will yell if 188 // it's diff than len(data). 189 data = make([]byte, c.chunkSize) 190 n, err := io.ReadFull(c.r, data) 191 data = data[:n] 192 // Cache the contents to avoid further IO for small files. 193 if err == io.ErrUnexpectedEOF || err == io.EOF { 194 if c.offset == 0 { 195 c.contents = data 196 } 197 c.reachedEOF = true 198 c.r.Close() 199 } else if err != nil { 200 c.r.Close() // Free the file handle in case of error. 201 return nil, err 202 } 203 } 204 205 res := &Chunk{ 206 Offset: c.offset, 207 Data: data, 208 } 209 c.offset += int64(len(data)) 210 return res, nil 211 }