github.com/bazelbuild/remote-apis-sdks@v0.0.0-20240425170053-8a36686a6350/go/pkg/cas/client.go

// Package cas implements an efficient client for Content Addressable Storage.
package cas

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"sync"
	"time"

	// Redundant imports are required for the google3 mirror. Aliases should not be changed.
	"github.com/pkg/errors"
	"golang.org/x/sync/semaphore"
	bsgrpc "google.golang.org/genproto/googleapis/bytestream"
	"google.golang.org/grpc"

	"github.com/bazelbuild/remote-apis-sdks/go/pkg/digest"
	"github.com/bazelbuild/remote-apis-sdks/go/pkg/retry"
	regrpc "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
	repb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
)

// Client is a client for Content Addressable Storage.
// Create one using NewClient.
//
// Goroutine-safe.
//
// All fields are considered immutable, and should not be changed.
type Client struct {
	conn *grpc.ClientConn
	// InstanceName is the full name of the RBE instance.
	InstanceName string

	// Config is the configuration that the client was created with.
	Config ClientConfig

	byteStream bsgrpc.ByteStreamClient
	cas        regrpc.ContentAddressableStorageClient

	// per-RPC semaphores

	semFindMissingBlobs *semaphore.Weighted
	semBatchUpdateBlobs *semaphore.Weighted
	semByteStreamWrite  *semaphore.Weighted

	// semFileIO limits the number of concurrent file system operations.
	// TODO(nodir): ensure it does not hurt streaming.
	semFileIO *semaphore.Weighted
	// semLargeFile ensures only a few large files are read/written at a time.
	// TODO(nodir): ensure this doesn't hurt performance on SSDs.
	semLargeFile *semaphore.Weighted

	// fileBufReaders is a pool of reusable *bufio.Readers
	// with buffer size equal to ClientConfig.FileIOSize, e.g. 4MiB.
	// Use fileBufReaders.Get(), then reset the reader with bufio.Reader.Reset,
	// and put it back to the pool when done.
	fileBufReaders sync.Pool

	// streamBufs is a pool of []byte slices used for ByteStream read/write RPCs.
	streamBufs sync.Pool

	// Mockable functions.

	testScheduleCheck func(ctx context.Context, item *uploadItem) error
}
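
// withPooledBufReader is an illustrative sketch, not part of the original
// file: it shows the fileBufReaders pattern documented on the struct above.
// A pooled reader is fetched, reset onto a new source, and returned to the
// pool when the callback finishes. *bytes.Reader stands in for any io.Reader
// source, such as an open file.
func (c *Client) withPooledBufReader(src *bytes.Reader, fn func(*bufio.Reader) error) error {
	r := c.fileBufReaders.Get().(*bufio.Reader) // buffer size is ClientConfig.FileIOSize
	defer c.fileBufReaders.Put(r)               // return the reader for reuse
	r.Reset(src)                                // point the pooled reader at the new source
	return fn(r)
}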

// ClientConfig is a config for Client.
// See DefaultClientConfig() for the default values.
type ClientConfig struct {
	// FSConcurrency is the maximum number of concurrent file system operations.
	// TODO(nodir): ensure this does not hurt streaming performance.
	FSConcurrency int

	// FSLargeConcurrency is the maximum number of concurrent large file read operations.
	FSLargeConcurrency int

	// SmallFileThreshold is a size threshold to categorize a file as small.
	// Such files are buffered entirely (read only once).
	SmallFileThreshold int64

	// LargeFileThreshold is a size threshold to categorize a file as large. For
	// such files, IO concurrency limits are much tighter and locality is
	// prioritized: the file is read for the first and second times with minimal
	// delay between the two.
	LargeFileThreshold int64

	// FileIOSize is the size of file reads.
	FileIOSize int64

	// CompressedBytestreamThreshold is the minimum blob size to enable compression
	// in ByteStream RPCs.
	// Use 0 to compress all writes, and a negative number to disable compression
	// for all operations.
	// DefaultClientConfig() disables compression by default.
	CompressedBytestreamThreshold int64

	// FindMissingBlobs is configuration for ContentAddressableStorage.FindMissingBlobs RPCs.
	// FindMissingBlobs.MaxSizeBytes is ignored.
	FindMissingBlobs RPCConfig

	// BatchUpdateBlobs is configuration for ContentAddressableStorage.BatchUpdateBlobs RPCs.
	BatchUpdateBlobs RPCConfig

	// ByteStreamWrite is configuration for ByteStream.Write RPCs.
	// ByteStreamWrite.MaxItems is ignored.
	ByteStreamWrite RPCConfig

	// RetryPolicy specifies how to retry requests on transient errors.
	RetryPolicy retry.BackoffPolicy

	// IgnoreCapabilities specifies whether to ignore server-provided capabilities.
	// Capabilities are consulted by default.
	IgnoreCapabilities bool
}

// RPCConfig is configuration for a particular CAS RPC.
// Some of the fields might not apply to certain RPCs.
//
// For streaming RPCs, the values apply to individual requests/responses in a
// stream, not the entire stream.
type RPCConfig struct {
	// Concurrency is the maximum number of RPCs in flight.
	Concurrency int

	// MaxSizeBytes is the maximum size of the request/response, in bytes.
	MaxSizeBytes int

	// MaxItems is the maximum number of blobs/digests per RPC.
	// Applies only to batch RPCs, such as FindMissingBlobs.
	MaxItems int

	// Timeout is the maximum duration of the RPC.
	Timeout time.Duration
}

// DefaultClientConfig returns the default config.
//
// To override a specific value:
//
//	cfg := DefaultClientConfig()
//	... mutate cfg ...
//	client, err := NewClientWithConfig(ctx, conn, instanceName, cfg)
func DefaultClientConfig() ClientConfig {
	return ClientConfig{
		// GCE docs recommend at least 32 concurrent IOs.
		// https://cloud.google.com/compute/docs/disks/optimizing-pd-performance#io-queue-depth
		// TODO(nodir): tune this number.
		FSConcurrency: 32,

		FSLargeConcurrency: 2,

		SmallFileThreshold: 1024 * 1024,       // 1MiB
		LargeFileThreshold: 256 * 1024 * 1024, // 256MiB

		// GCE docs recommend 4MB IO size for large files.
		// https://cloud.google.com/compute/docs/disks/optimizing-pd-performance#io-size
		FileIOSize: 4 * 1024 * 1024, // 4MiB

		FindMissingBlobs: RPCConfig{
			Concurrency: 256, // Should be >= BatchUpdateBlobs.Concurrency.
			MaxItems:    1000,
			Timeout:     time.Minute,
		},
		BatchUpdateBlobs: RPCConfig{
			Concurrency: 256,

			// This is a suggested approximate limit based on the current RBE implementation for writes.
			// Above that, BatchUpdateBlobs calls start to exceed a typical minute timeout.
			// This default might not be best for reads though.
			MaxItems: 4000,
			// 4MiB is the default gRPC request size limit.
			MaxSizeBytes: 4 * 1024 * 1024,
			Timeout:      time.Minute,
		},
		ByteStreamWrite: RPCConfig{
			Concurrency: 256,
			// 4MiB is the default gRPC request size limit.
			MaxSizeBytes: 4 * 1024 * 1024,
			Timeout:      time.Minute,
		},

		// Disable compression by default.
		CompressedBytestreamThreshold: -1,

		RetryPolicy: retry.ExponentialBackoff(225*time.Millisecond, 2*time.Second, retry.Attempts(6)),
	}
}
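
// exampleTunedConfig is an illustrative sketch, not part of the original file:
// it spells out the override pattern from the DefaultClientConfig doc comment.
// The chosen values are arbitrary examples, not recommendations.
func exampleTunedConfig() ClientConfig {
	cfg := DefaultClientConfig()
	cfg.CompressedBytestreamThreshold = 1024 * 1024 // compress ByteStream writes of blobs >= 1MiB
	cfg.FindMissingBlobs.Concurrency = 512          // allow more concurrent FindMissingBlobs calls
	return cfg
}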

// Validate returns a non-nil error if the config is invalid.
func (c *ClientConfig) Validate() error {
	switch {
	case c.FSConcurrency <= 0:
		return fmt.Errorf("FSConcurrency must be positive")

	case c.SmallFileThreshold < 0:
		return fmt.Errorf("SmallFileThreshold must be non-negative")
	case c.LargeFileThreshold <= 0:
		return fmt.Errorf("LargeFileThreshold must be positive")
	case c.SmallFileThreshold >= c.LargeFileThreshold:
		return fmt.Errorf("SmallFileThreshold must be smaller than LargeFileThreshold")

	case c.FileIOSize <= 0:
		return fmt.Errorf("FileIOSize must be positive")

	// Checking more than 100K blobs may run into the request size limits.
	// It does not really make sense to check even >10K blobs, so limit to 10K.
	case c.FindMissingBlobs.MaxItems > 10000:
		return fmt.Errorf("FindMissingBlobs.MaxItems must be <= 10000")
	}

	if err := c.FindMissingBlobs.validate(); err != nil {
		return errors.Wrap(err, "FindMissingBlobs")
	}
	if err := c.BatchUpdateBlobs.validate(); err != nil {
		return errors.Wrap(err, "BatchUpdateBlobs")
	}
	if err := c.ByteStreamWrite.validate(); err != nil {
		return errors.Wrap(err, "ByteStreamWrite")
	}
	return nil
}

// validate returns an error if the config is invalid.
func (c *RPCConfig) validate() error {
	switch {
	case c.Concurrency <= 0:
		return fmt.Errorf("Concurrency must be positive")
	case c.Timeout <= 0:
		return fmt.Errorf("Timeout must be positive")
	default:
		return nil
	}
}

// NewClient creates a new client with the default configuration.
// Use client.Dial to create a connection.
func NewClient(ctx context.Context, conn *grpc.ClientConn, instanceName string) (*Client, error) {
	return NewClientWithConfig(ctx, conn, instanceName, DefaultClientConfig())
}

// NewClientWithConfig creates a new client and accepts a configuration.
func NewClientWithConfig(ctx context.Context, conn *grpc.ClientConn, instanceName string, config ClientConfig) (*Client, error) {
	switch err := config.Validate(); {
	case err != nil:
		return nil, errors.Wrap(err, "invalid config")
	case conn == nil:
		return nil, fmt.Errorf("conn is unspecified")
	case instanceName == "":
		return nil, fmt.Errorf("instance name is unspecified")
	}

	client := &Client{
		InstanceName: instanceName,
		Config:       config,
		conn:         conn,
		byteStream:   bsgrpc.NewByteStreamClient(conn),
		cas:          regrpc.NewContentAddressableStorageClient(conn),
	}
	if !client.Config.IgnoreCapabilities {
		if err := client.checkCapabilities(ctx); err != nil {
			return nil, errors.Wrapf(err, "checking capabilities")
		}
	}

	client.init()

	return client, nil
}
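
// exampleNewClient is an illustrative sketch, not part of the original file:
// it shows the construction path above end to end. The endpoint and instance
// name are placeholders, and a real client should use TLS credentials rather
// than grpc.WithInsecure.
func exampleNewClient(ctx context.Context) (*Client, error) {
	conn, err := grpc.Dial("example.googleapis.com:443", grpc.WithInsecure())
	if err != nil {
		return nil, errors.Wrap(err, "dial")
	}
	return NewClient(ctx, conn, "projects/example/instances/default_instance")
}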

var emptyReader = bytes.NewReader(nil)

// init is a part of NewClientWithConfig that can be done in tests without
// creating a real gRPC connection. This function exists purely to aid testing,
// and is tightly coupled with NewClientWithConfig.
func (c *Client) init() {
	c.semFindMissingBlobs = semaphore.NewWeighted(int64(c.Config.FindMissingBlobs.Concurrency))
	c.semBatchUpdateBlobs = semaphore.NewWeighted(int64(c.Config.BatchUpdateBlobs.Concurrency))
	c.semByteStreamWrite = semaphore.NewWeighted(int64(c.Config.ByteStreamWrite.Concurrency))

	c.semFileIO = semaphore.NewWeighted(int64(c.Config.FSConcurrency))
	c.semLargeFile = semaphore.NewWeighted(int64(c.Config.FSLargeConcurrency))
	c.fileBufReaders.New = func() interface{} {
		return bufio.NewReaderSize(emptyReader, int(c.Config.FileIOSize))
	}

	streamBufSize := 32 * 1024 // by default, send 32KiB chunks
	if streamBufSize < c.Config.ByteStreamWrite.MaxSizeBytes {
		streamBufSize = c.Config.ByteStreamWrite.MaxSizeBytes
	}
	c.streamBufs.New = func() interface{} {
		buf := make([]byte, streamBufSize)
		return &buf
	}
}

// unaryRPC calls f with retries, and with per-RPC timeouts.
// It does not limit concurrency.
// It is useful when f calls a unary RPC.
func (c *Client) unaryRPC(ctx context.Context, cfg *RPCConfig, f func(context.Context) error) error {
	return c.withRetries(ctx, func(ctx context.Context) error {
		ctx, cancel := context.WithTimeout(ctx, cfg.Timeout)
		defer cancel()
		return f(ctx)
	})
}

func (c *Client) withRetries(ctx context.Context, f func(context.Context) error) error {
	return retry.WithPolicy(ctx, retry.TransientOnly, c.Config.RetryPolicy, func() error {
		return f(ctx)
	})
}

// checkCapabilities consults the server-side capabilities and potentially
// mutates c.Config.
func (c *Client) checkCapabilities(ctx context.Context) error {
	caps, err := regrpc.NewCapabilitiesClient(c.conn).GetCapabilities(ctx, &repb.GetCapabilitiesRequest{InstanceName: c.InstanceName})
	if err != nil {
		return errors.Wrapf(err, "GetCapabilities RPC")
	}

	if err := digest.CheckCapabilities(caps); err != nil {
		return errors.Wrapf(err, "digest function mismatch")
	}

	if c.Config.BatchUpdateBlobs.MaxSizeBytes > int(caps.CacheCapabilities.MaxBatchTotalSizeBytes) {
		c.Config.BatchUpdateBlobs.MaxSizeBytes = int(caps.CacheCapabilities.MaxBatchTotalSizeBytes)
	}

	// TODO(nodir): check compression capabilities.

	return nil
}

// withPerCallTimeout returns a function wrapper that cancels the context if
// fn does not return within the timeout.
func withPerCallTimeout(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc, func(fn func())) {
	ctx, cancel := context.WithCancel(ctx)
	return ctx, cancel, func(fn func()) {
		stop := make(chan struct{})
		defer close(stop)
		go func() {
			select {
			case <-time.After(timeout):
				cancel()
			case <-stop:
			}
		}()
		fn()
	}
}
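
// exampleWithPerCallTimeout is an illustrative sketch, not part of the
// original file: it shows how withPerCallTimeout is meant to be used. The
// returned run wrapper bounds each call individually, while the stream as a
// whole shares one cancelable context.
func exampleWithPerCallTimeout(ctx context.Context) {
	ctx, cancel, run := withPerCallTimeout(ctx, time.Second)
	defer cancel()
	// Each invocation of run cancels ctx if fn takes longer than a second,
	// which aborts any RPC stream created with ctx.
	run(func() {
		_ = ctx // a single stream send/recv using ctx would go here
	})
}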