github.com/bazelbuild/remote-apis-sdks@v0.0.0-20240425170053-8a36686a6350/go/pkg/cas/client.go

// Package cas implements an efficient client for Content Addressable Storage.
package cas

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"sync"
	"time"

	// Redundant imports are required for the google3 mirror. Aliases should not be changed.
	"github.com/pkg/errors"
	"golang.org/x/sync/semaphore"
	bsgrpc "google.golang.org/genproto/googleapis/bytestream"
	"google.golang.org/grpc"

	"github.com/bazelbuild/remote-apis-sdks/go/pkg/digest"
	"github.com/bazelbuild/remote-apis-sdks/go/pkg/retry"
	regrpc "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
	repb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
)

// Client is a client for Content Addressable Storage.
// Create one using NewClient.
//
// Goroutine-safe.
//
// All fields are considered immutable, and should not be changed.
type Client struct {
	conn *grpc.ClientConn
	// InstanceName is the full name of the RBE instance.
	InstanceName string

	// Config is the configuration that the client was created with.
	Config ClientConfig

	byteStream bsgrpc.ByteStreamClient
	cas        regrpc.ContentAddressableStorageClient

	// per-RPC semaphores

	semFindMissingBlobs *semaphore.Weighted
	semBatchUpdateBlobs *semaphore.Weighted
	semByteStreamWrite  *semaphore.Weighted

	// TODO(nodir): ensure it does not hurt streaming.
	semFileIO *semaphore.Weighted
	// semLargeFile ensures only a few large files are read/written at a time.
	// TODO(nodir): ensure this doesn't hurt performance on SSDs.
	semLargeFile *semaphore.Weighted

	// fileBufReaders is a pool of reusable *bufio.Readers
	// with buffer size = ClientConfig.FileIOSize, e.g. 4MiB by default.
	// Use fileBufReaders.Get(), reset the reader with bufio.Reader.Reset,
	// and put it back to the pool when done.
	fileBufReaders sync.Pool

	// streamBufs is a pool of []byte slices used for ByteStream read/write RPCs.
	streamBufs sync.Pool

	// Mockable functions.

	testScheduleCheck func(ctx context.Context, item *uploadItem) error
}

// ClientConfig is a config for Client.
// See DefaultClientConfig() for the default values.
type ClientConfig struct {
	// FSConcurrency is the maximum number of concurrent file system operations.
	// TODO(nodir): ensure this does not hurt streaming performance.
	FSConcurrency int

	// FSLargeConcurrency is the maximum number of concurrent large file read operations.
	FSLargeConcurrency int

	// SmallFileThreshold is a size threshold to categorize a file as small.
	// Such files are buffered entirely (read only once).
	SmallFileThreshold int64

	// LargeFileThreshold is a size threshold to categorize a file as large. For
	// such files, IO concurrency limits are much tighter and locality is
	// prioritized: the file is read for the first and second times with minimal
	// delay between the two.
	LargeFileThreshold int64

	// FileIOSize is the size of file reads.
	FileIOSize int64

	// CompressedBytestreamThreshold is the minimum blob size, in bytes, at which
	// compression is enabled for ByteStream RPCs.
	// Use 0 to compress all writes, or a negative number to disable compression
	// entirely.
	// DefaultClientConfig() disables compression by default.
	CompressedBytestreamThreshold int64

	// FindMissingBlobs is configuration for ContentAddressableStorage.FindMissingBlobs RPCs.
	// FindMissingBlobs.MaxSizeBytes is ignored.
	FindMissingBlobs RPCConfig

	// BatchUpdateBlobs is configuration for ContentAddressableStorage.BatchUpdateBlobs RPCs.
	BatchUpdateBlobs RPCConfig

	// ByteStreamWrite is configuration for ByteStream.Write RPCs.
	// ByteStreamWrite.MaxItems is ignored.
	ByteStreamWrite RPCConfig

	// RetryPolicy specifies how to retry requests on transient errors.
	RetryPolicy retry.BackoffPolicy

	// IgnoreCapabilities specifies whether to ignore server-provided capabilities.
	// Capabilities are consulted by default.
	IgnoreCapabilities bool
}

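// To illustrate the size classes above with the defaults from
// DefaultClientConfig() (SmallFileThreshold = 1MiB, LargeFileThreshold = 256MiB;
// the file sizes below are examples only): a 512KiB file is small and is
// buffered entirely, a 300MiB file is large and is additionally limited by the
// tighter FSLargeConcurrency, and a 100MiB file falls in between, getting
// neither treatment.
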
// RPCConfig is configuration for a particular CAS RPC.
// Some of the fields might not apply to certain RPCs.
//
// For streaming RPCs, the values apply to individual requests/responses in a
// stream, not the entire stream.
type RPCConfig struct {
	// Concurrency is the maximum number of RPCs in flight.
	Concurrency int

	// MaxSizeBytes is the maximum size of the request/response, in bytes.
	MaxSizeBytes int

	// MaxItems is the maximum number of blobs/digests per RPC.
	// Applies only to batch RPCs, such as FindMissingBlobs.
	MaxItems int

	// Timeout is the maximum duration of the RPC.
	Timeout time.Duration
}

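// For illustration (the values here are arbitrary, not recommendations), an
// individual RPC config can be tightened before the client is created:
//
//	cfg := DefaultClientConfig()
//	cfg.FindMissingBlobs.Concurrency = 64
//	cfg.FindMissingBlobs.Timeout = 30 * time.Second
//	client, err := NewClientWithConfig(ctx, conn, instanceName, cfg)
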
// DefaultClientConfig returns the default config.
//
// To override a specific value:
//
//	cfg := DefaultClientConfig()
//	... mutate cfg ...
//	client, err := NewClientWithConfig(ctx, conn, instanceName, cfg)
func DefaultClientConfig() ClientConfig {
	return ClientConfig{
		// GCE docs recommend at least 32 concurrent IOs.
		// https://cloud.google.com/compute/docs/disks/optimizing-pd-performance#io-queue-depth
		// TODO(nodir): tune this number.
		FSConcurrency: 32,

		FSLargeConcurrency: 2,

		SmallFileThreshold: 1024 * 1024,       // 1MiB
		LargeFileThreshold: 256 * 1024 * 1024, // 256MiB

		// GCE docs recommend 4MB IO size for large files.
		// https://cloud.google.com/compute/docs/disks/optimizing-pd-performance#io-size
		FileIOSize: 4 * 1024 * 1024, // 4MiB

		FindMissingBlobs: RPCConfig{
			Concurrency: 256, // Should be >= BatchUpdateBlobs.Concurrency.
			MaxItems:    1000,
			Timeout:     time.Minute,
		},
		BatchUpdateBlobs: RPCConfig{
			Concurrency: 256,

			// This is a suggested approximate limit based on the current RBE implementation for writes.
			// Above that, BatchUpdateBlobs calls start to exceed a typical one-minute timeout.
			// This default might not be best for reads though.
			MaxItems: 4000,
			// 4MiB is the default gRPC request size limit.
			MaxSizeBytes: 4 * 1024 * 1024,
			Timeout:      time.Minute,
		},
		ByteStreamWrite: RPCConfig{
			Concurrency: 256,
			// 4MiB is the default gRPC request size limit.
			MaxSizeBytes: 4 * 1024 * 1024,
			Timeout:      time.Minute,
		},

		// Disable compression by default.
		CompressedBytestreamThreshold: -1,

		RetryPolicy: retry.ExponentialBackoff(225*time.Millisecond, 2*time.Second, retry.Attempts(6)),
	}
}

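// For example (a sketch; the 1MiB threshold is an arbitrary choice, not a
// recommendation), to compress ByteStream writes for blobs of 1MiB and larger:
//
//	cfg := DefaultClientConfig()
//	cfg.CompressedBytestreamThreshold = 1024 * 1024
//	client, err := NewClientWithConfig(ctx, conn, instanceName, cfg)
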
// Validate returns a non-nil error if the config is invalid.
func (c *ClientConfig) Validate() error {
	switch {
	case c.FSConcurrency <= 0:
		return fmt.Errorf("FSConcurrency must be positive")

	case c.SmallFileThreshold < 0:
		return fmt.Errorf("SmallFileThreshold must be non-negative")
	case c.LargeFileThreshold <= 0:
		return fmt.Errorf("LargeFileThreshold must be positive")
	case c.SmallFileThreshold >= c.LargeFileThreshold:
		return fmt.Errorf("SmallFileThreshold must be smaller than LargeFileThreshold")

	case c.FileIOSize <= 0:
		return fmt.Errorf("FileIOSize must be positive")

	// Checking more than 100K blobs may run into the request size limits.
	// It does not really make sense to check even >10K blobs, so limit to 10K.
	case c.FindMissingBlobs.MaxItems > 10000:
		return fmt.Errorf("FindMissingBlobs.MaxItems must be <= 10000")
	}

	if err := c.FindMissingBlobs.validate(); err != nil {
		return errors.Wrap(err, "FindMissingBlobs")
	}
	if err := c.BatchUpdateBlobs.validate(); err != nil {
		return errors.Wrap(err, "BatchUpdateBlobs")
	}
	if err := c.ByteStreamWrite.validate(); err != nil {
		return errors.Wrap(err, "ByteStreamWrite")
	}
	return nil
}

// validate returns an error if the config is invalid.
func (c *RPCConfig) validate() error {
	switch {
	case c.Concurrency <= 0:
		return fmt.Errorf("Concurrency must be positive")
	case c.Timeout <= 0:
		return fmt.Errorf("Timeout must be positive")
	default:
		return nil
	}
}

// NewClient creates a new client with the default configuration.
// Use client.Dial to create a connection.
func NewClient(ctx context.Context, conn *grpc.ClientConn, instanceName string) (*Client, error) {
	return NewClientWithConfig(ctx, conn, instanceName, DefaultClientConfig())
}

// NewClientWithConfig creates a new client and accepts a configuration.
func NewClientWithConfig(ctx context.Context, conn *grpc.ClientConn, instanceName string, config ClientConfig) (*Client, error) {
	switch err := config.Validate(); {
	case err != nil:
		return nil, errors.Wrap(err, "invalid config")
	case conn == nil:
		return nil, fmt.Errorf("conn is unspecified")
	case instanceName == "":
		return nil, fmt.Errorf("instance name is unspecified")
	}

	client := &Client{
		InstanceName: instanceName,
		Config:       config,
		conn:         conn,
		byteStream:   bsgrpc.NewByteStreamClient(conn),
		cas:          regrpc.NewContentAddressableStorageClient(conn),
	}
	if !client.Config.IgnoreCapabilities {
		if err := client.checkCapabilities(ctx); err != nil {
			return nil, errors.Wrapf(err, "checking capabilities")
		}
	}

	client.init()

	return client, nil
}

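// A construction sketch (the endpoint, credentials, and instance name are
// placeholders; any dialing approach that yields a *grpc.ClientConn works):
//
//	conn, err := grpc.Dial("example-remote-cache:443", grpc.WithTransportCredentials(creds))
//	if err != nil {
//		return err
//	}
//	casClient, err := NewClient(ctx, conn, "projects/example/instances/default_instance")
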
var emptyReader = bytes.NewReader(nil)

// init is a part of NewClientWithConfig that can be done in tests without
// creating a real gRPC connection. This function exists purely to aid testing,
// and is tightly coupled with NewClientWithConfig.
func (c *Client) init() {
	c.semFindMissingBlobs = semaphore.NewWeighted(int64(c.Config.FindMissingBlobs.Concurrency))
	c.semBatchUpdateBlobs = semaphore.NewWeighted(int64(c.Config.BatchUpdateBlobs.Concurrency))
	c.semByteStreamWrite = semaphore.NewWeighted(int64(c.Config.ByteStreamWrite.Concurrency))

	c.semFileIO = semaphore.NewWeighted(int64(c.Config.FSConcurrency))
	c.semLargeFile = semaphore.NewWeighted(int64(c.Config.FSLargeConcurrency))
	c.fileBufReaders.New = func() interface{} {
		return bufio.NewReaderSize(emptyReader, int(c.Config.FileIOSize))
	}

	streamBufSize := 32 * 1024 // by default, send 32KiB chunks.
	if streamBufSize > c.Config.ByteStreamWrite.MaxSizeBytes {
		// Never exceed the per-request size limit.
		streamBufSize = c.Config.ByteStreamWrite.MaxSizeBytes
	}
	c.streamBufs.New = func() interface{} {
		buf := make([]byte, streamBufSize)
		return &buf
	}
}

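// A usage sketch for the pools initialized above (f is a hypothetical open
// *os.File; the real call sites live elsewhere in this package): take a reader
// from the pool, point it at the file, and return it when done.
//
//	r := c.fileBufReaders.Get().(*bufio.Reader)
//	r.Reset(f)
//	// ... read the file through r ...
//	c.fileBufReaders.Put(r)
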
// unaryRPC calls f with retries, and with per-RPC timeouts.
// Does not limit concurrency.
// It is useful when f calls a unary RPC.
func (c *Client) unaryRPC(ctx context.Context, cfg *RPCConfig, f func(context.Context) error) error {
	return c.withRetries(ctx, func(ctx context.Context) error {
		ctx, cancel := context.WithTimeout(ctx, cfg.Timeout)
		defer cancel()
		return f(ctx)
	})
}

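// A call-site sketch (req and res are hypothetical; the real call sites live
// elsewhere in this package). The caller acquires the matching semaphore
// first, so concurrency is limited outside of unaryRPC:
//
//	var res *repb.FindMissingBlobsResponse
//	if err := c.semFindMissingBlobs.Acquire(ctx, 1); err != nil {
//		return err
//	}
//	defer c.semFindMissingBlobs.Release(1)
//	err := c.unaryRPC(ctx, &c.Config.FindMissingBlobs, func(ctx context.Context) error {
//		var callErr error
//		res, callErr = c.cas.FindMissingBlobs(ctx, req)
//		return callErr
//	})
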
func (c *Client) withRetries(ctx context.Context, f func(context.Context) error) error {
	return retry.WithPolicy(ctx, retry.TransientOnly, c.Config.RetryPolicy, func() error {
		return f(ctx)
	})
}

// checkCapabilities consults with server-side capabilities and potentially
// mutates c.Config.
func (c *Client) checkCapabilities(ctx context.Context) error {
	caps, err := regrpc.NewCapabilitiesClient(c.conn).GetCapabilities(ctx, &repb.GetCapabilitiesRequest{InstanceName: c.InstanceName})
	if err != nil {
		return errors.Wrapf(err, "GetCapabilities RPC")
	}

	if err := digest.CheckCapabilities(caps); err != nil {
		return errors.Wrapf(err, "digest function mismatch")
	}

	if c.Config.BatchUpdateBlobs.MaxSizeBytes > int(caps.CacheCapabilities.MaxBatchTotalSizeBytes) {
		c.Config.BatchUpdateBlobs.MaxSizeBytes = int(caps.CacheCapabilities.MaxBatchTotalSizeBytes)
	}

	// TODO(nodir): check compression capabilities.

	return nil
}

// withPerCallTimeout returns a function wrapper that cancels the context if
// fn does not return within the timeout.
func withPerCallTimeout(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc, func(fn func())) {
	ctx, cancel := context.WithCancel(ctx)
	return ctx, cancel, func(fn func()) {
		stop := make(chan struct{})
		defer close(stop)
		go func() {
			select {
			case <-time.After(timeout):
				cancel()
			case <-stop:
			}
		}()
		fn()
	}
}
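
// A usage sketch (stream and requests are hypothetical). Each Send gets its
// own timeout; if any single call overruns it, the shared context is
// cancelled, rather than putting one fixed deadline on the whole stream:
//
//	ctx, cancel, withTimeout := withPerCallTimeout(ctx, cfg.Timeout)
//	defer cancel()
//	for _, req := range requests {
//		var err error
//		withTimeout(func() { err = stream.Send(req) })
//		if err != nil {
//			return err
//		}
//	}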