github.com/bazelbuild/remote-apis-sdks@v0.0.0-20240425170053-8a36686a6350/go/pkg/chunker/chunker.go

// Package chunker provides a way to chunk an input into uploadable-size byte slices.
package chunker

import (
	"fmt"
	"io"

	"github.com/klauspost/compress/zstd"
	"github.com/pkg/errors"

	"github.com/bazelbuild/remote-apis-sdks/go/pkg/reader"
	"github.com/bazelbuild/remote-apis-sdks/go/pkg/uploadinfo"
)

// DefaultChunkSize is the default chunk size for ByteStream.Write RPCs.
const DefaultChunkSize = 1024 * 1024

// IOBufferSize regulates how many bytes at a time the Chunker will read from a file source.
var IOBufferSize = 10 * 1024 * 1024

// ErrEOF is returned when Next is called while HasNext is false.
var ErrEOF = errors.New("ErrEOF")

// fullCompressor is the compressor for full blobs.
// It is *only* thread-safe for EncodeAll calls and should not be used for streamed compression.
// While we avoid sending zero-length blobs, we do want to be able to create zero-length
// compressed blobs if necessary.
var fullCompressor, _ = zstd.NewWriter(nil, zstd.WithZeroFrames(true))
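
// compressFullBlob is a hypothetical helper (not part of the original API)
// sketching the one pattern for which sharing fullCompressor is safe: a single
// EncodeAll call per blob. The encoder's streaming Write/Close methods must
// not be used on this shared instance.
func compressFullBlob(raw []byte) []byte {
	// EncodeAll is documented by klauspost/compress as safe for concurrent
	// use; with WithZeroFrames(true), an empty input still yields a valid
	// zstd frame rather than zero bytes.
	return fullCompressor.EncodeAll(raw, nil)
}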

// Chunker can be used to chunk an input into uploadable-size byte slices.
// A single Chunker is NOT thread-safe; it should be used by a single uploader thread.
type Chunker struct {
	chunkSize int
	r         reader.ReadSeeker
	// An optional cache of the full data. It will be present in these cases:
	// * The Chunker was initialized from a []byte.
	// * Chunker.FullData was called at least once.
	// * Next() was called and the read was less than IOBufferSize.
	// Once contents are initialized, they are immutable.
	contents   []byte
	offset     int64
	reachedEOF bool

	ue *uploadinfo.Entry
}

// New creates a new chunker from an uploadinfo.Entry.
// If compressed is true, the data of the Entry will be compressed on the fly.
func New(ue *uploadinfo.Entry, compressed bool, chunkSize int) (*Chunker, error) {
	if chunkSize < 1 {
		chunkSize = DefaultChunkSize
	}
	var c *Chunker
	if ue.IsBlob() {
		contents := make([]byte, len(ue.Contents))
		copy(contents, ue.Contents)
		if compressed {
			contents = fullCompressor.EncodeAll(contents, nil)
		}
		c = &Chunker{
			contents: contents,
		}
	} else if ue.IsFile() {
		r := reader.NewFileReadSeeker(ue.Path, IOBufferSize)
		if compressed {
			var err error
			r, err = reader.NewCompressedSeeker(r)
			if err != nil {
				return nil, err
			}
		}
		c = &Chunker{
			r: r,
		}

		// Cap the chunk size at the size of the IO buffer used for file reads.
		if chunkSize > IOBufferSize {
			chunkSize = IOBufferSize
		}
	} else {
		return nil, errors.New("invalid Entry")
	}

	c.chunkSize = chunkSize
	c.ue = ue
	return c, nil
}
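
// newBlobChunker is a hypothetical convenience wrapper sketching typical use
// of New. It assumes uploadinfo.EntryFromBlob from this SDK's uploadinfo
// package, and requests an uncompressed Chunker with the default chunk size.
func newBlobChunker(blob []byte) (*Chunker, error) {
	ue := uploadinfo.EntryFromBlob(blob)
	return New(ue, false /* compressed */, DefaultChunkSize)
}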

// String returns an identifiable representation of the Chunker.
func (c *Chunker) String() string {
	size := fmt.Sprintf("<%d bytes>", c.ue.Digest.Size)
	if !c.ue.IsFile() {
		return size
	}
	return fmt.Sprintf("%s: %s", size, c.ue.Path)
}

// Offset returns the current Chunker offset.
func (c *Chunker) Offset() int64 {
	return c.offset
}

// ChunkSize returns the maximum size of each chunk.
func (c *Chunker) ChunkSize() int {
	return c.chunkSize
}

// Reset resets the Chunker to its newly constructed state.
// Useful for upload retries.
// TODO(olaola): implement Seek(offset) when we have resumable uploads.
func (c *Chunker) Reset() error {
	if c.r != nil {
		if err := c.r.SeekOffset(0); err != nil {
			return errors.Wrapf(err, "failed to call SeekOffset(0) for %s", c.ue.Path)
		}
	}
	c.offset = 0
	c.reachedEOF = false
	return nil
}
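
// uploadWithRetry is a hypothetical helper sketching the retry pattern Reset
// exists for: a failed streaming upload is retried from offset 0 by rewinding
// the Chunker, since resuming from a nonzero offset is not implemented yet
// (see the TODO above). sendAll stands in for a real ByteStream.Write loop.
func uploadWithRetry(c *Chunker, attempts int, sendAll func(*Chunker) error) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = sendAll(c); err == nil {
			return nil
		}
		// Rewind so the next attempt re-reads the same bytes from the start.
		if rerr := c.Reset(); rerr != nil {
			return rerr
		}
	}
	return err
}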

// FullData returns the overall (non-chunked) underlying data. The Chunker is Reset.
// It is meant for batch uploading of small inputs.
func (c *Chunker) FullData() ([]byte, error) {
	if err := c.Reset(); err != nil {
		return nil, err
	}
	if c.contents != nil {
		return c.contents, nil
	}
	var err error
	if !c.r.IsInitialized() {
		err = c.r.Initialize()
	}
	if err != nil {
		c.r.Close() // Free the file handle in case of error.
		return nil, err
	}
	// Cache the contents so that the next call to FullData() doesn't result in a file read.
	c.contents, err = io.ReadAll(c.r)
	c.r.Close()
	return c.contents, err
}
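
// batchPayload is a hypothetical helper sketching the intended use of
// FullData: callers batching small inputs (e.g. into one BatchUpdateBlobs
// request) fetch the whole payload at once instead of streaming chunks. The
// maxBatchSize threshold is an assumed caller-chosen limit.
func batchPayload(c *Chunker, maxBatchSize int64) ([]byte, error) {
	if c.ue.Digest.Size > maxBatchSize {
		return nil, errors.Errorf("entry of %d bytes exceeds batch limit %d", c.ue.Digest.Size, maxBatchSize)
	}
	return c.FullData()
}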

// HasNext returns whether a subsequent call to Next will return a valid chunk. Always true for a
// newly created Chunker.
func (c *Chunker) HasNext() bool {
	return !c.reachedEOF
}

// Chunk is a piece of a []byte blob suitable for being uploaded.
type Chunk struct {
	Offset int64
	Data   []byte
}

// Next returns the next chunk of data or an error. ErrEOF is returned if and only if HasNext is
// false. Chunk.Data will be empty if and only if the full underlying data is empty (in which case
// it will be the only chunk returned).
func (c *Chunker) Next() (*Chunk, error) {
	if !c.HasNext() {
		return nil, ErrEOF
	}
	if c.ue.Digest.Size == 0 {
		c.reachedEOF = true
		return &Chunk{}, nil
	}

	var data []byte
	if c.contents != nil {
		// As long as we have data in memory, it's much more efficient to return
		// a view slice than to copy it around. Contents are immutable so it's okay
		// to return the slice.
		endRead := int(c.offset) + c.chunkSize
		if endRead >= len(c.contents) {
			endRead = len(c.contents)
			c.reachedEOF = true
		}
		data = c.contents[c.offset:endRead]
	} else {
		if !c.r.IsInitialized() {
			err := c.r.Initialize()
			if err != nil {
				return nil, err
			}
		}

		// There is no need to check the number of bytes read separately:
		// io.ReadFull returns io.EOF or io.ErrUnexpectedEOF whenever it reads
		// fewer than len(data) bytes.
		data = make([]byte, c.chunkSize)
		n, err := io.ReadFull(c.r, data)
		data = data[:n]
		// Cache the contents to avoid further IO for small files.
		if err == io.ErrUnexpectedEOF || err == io.EOF {
			if c.offset == 0 {
				c.contents = data
			}
			c.reachedEOF = true
			c.r.Close()
		} else if err != nil {
			c.r.Close() // Free the file handle in case of error.
			return nil, err
		}
	}

	res := &Chunk{
		Offset: c.offset,
		Data:   data,
	}
	c.offset += int64(len(data))
	return res, nil
}
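
// drainChunks is a hypothetical helper sketching the intended Next/HasNext
// loop: chunks arrive in order, every byte of the underlying data is visited
// exactly once, and the loop terminates because Next sets reachedEOF when it
// returns the final chunk.
func drainChunks(c *Chunker, visit func(offset int64, data []byte) error) error {
	for c.HasNext() {
		chunk, err := c.Next()
		if err != nil {
			return err
		}
		// Note: chunk.Data may alias the Chunker's internal cache, so
		// callers must not modify it.
		if err := visit(chunk.Offset, chunk.Data); err != nil {
			return err
		}
	}
	return nil
}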