go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/util/bqutil/storagewrite.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bqutil

import (
	"context"
	"time"

	"cloud.google.com/go/bigquery/storage/apiv1/storagepb"
	"cloud.google.com/go/bigquery/storage/managedwriter"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/api/option"
	"google.golang.org/grpc"
	"google.golang.org/grpc/keepalive"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/descriptorpb"

	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/grpc/grpcmon"
	"go.chromium.org/luci/server/auth"
)

// batchMaxBytes is the maximum number of row bytes to send in one
// BigQuery Storage Write API AppendRows request. At the time of writing,
// the request size limit for this RPC is 10 MB:
// https://cloud.google.com/bigquery/quotas#write-api-limits.
// The total size of rows in a batch must stay below this limit, as each
// request also carries some overhead beyond the row payloads.
const batchMaxBytes = 9 * 1000 * 1000 // 9 MB

// NewWriterClient returns a new BigQuery managedwriter client for use with
// the given GCP project. The client authenticates as LUCI Bisection itself.
func NewWriterClient(ctx context.Context, gcpProject string) (*managedwriter.Client, error) {
	// Create a shared client for all writes.
	// This ensures a shared connection pool is used for all writes,
	// as recommended by:
	// https://cloud.google.com/bigquery/docs/write-api-best-practices#limit_the_number_of_concurrent_connections
	creds, err := auth.GetPerRPCCredentials(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	if err != nil {
		return nil, errors.Annotate(err, "failed to initialize credentials").Err()
	}
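	// The dial options below add LUCI RPC monitoring (grpcmon), OpenTelemetry
	// client tracing interceptors, the per-RPC credentials obtained above, and
	// a one-minute keepalive ping to keep long-lived write connections healthy.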
	return managedwriter.NewClient(ctx, gcpProject,
		option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})),
		option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithPerRPCCredentials(creds)),
		option.WithGRPCDialOption(grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time: time.Minute,
		})))
}
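
// A minimal usage sketch (assuming a server context with LUCI auth configured;
// "my-project" is a placeholder GCP project ID):
//
//	client, err := bqutil.NewWriterClient(ctx, "my-project")
//	if err != nil {
//		return err
//	}
//	defer client.Close()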

// Writer is used to export rows to a BigQuery table.
type Writer struct {
	client                *managedwriter.Client
	tableName             string
	tableSchemaDescriptor *descriptorpb.DescriptorProto
}

// NewWriter creates a writer for exporting rows to the provided BigQuery table
// via the provided managedwriter client.
func NewWriter(
	client *managedwriter.Client,
	tableName string,
	tableSchemaDescriptor *descriptorpb.DescriptorProto,
) *Writer {
	return &Writer{
		client:                client,
		tableName:             tableName,
		tableSchemaDescriptor: tableSchemaDescriptor,
	}
}
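
// A sketch of a typical export flow. The names here are placeholders: rowpb.Row
// stands in for the exported row proto, and the destination table uses the
// "projects/{project}/datasets/{dataset}/tables/{table}" form expected by the
// Storage Write API. The schema descriptor can be derived with the managedwriter
// adapt package:
//
//	desc, err := adapt.NormalizeDescriptor((&rowpb.Row{}).ProtoReflect().Descriptor())
//	if err != nil {
//		return err
//	}
//	w := bqutil.NewWriter(client, "projects/my-project/datasets/my_dataset/tables/my_table", desc)
//	if err := w.AppendRowsWithPendingStream(ctx, rows); err != nil {
//		return err
//	}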

// AppendRowsWithPendingStream appends rows to the BigQuery table via a pending
// stream. This provides all-or-nothing semantics for the insertion.
func (s *Writer) AppendRowsWithPendingStream(ctx context.Context, rows []proto.Message) error {
	ms, err := s.client.NewManagedStream(ctx,
		managedwriter.WithType(managedwriter.PendingStream),
		managedwriter.WithSchemaDescriptor(s.tableSchemaDescriptor),
		managedwriter.WithDestinationTable(s.tableName))
	if err != nil {
		return err
	}
	defer ms.Close()

	err = s.batchAppendRows(ctx, ms, rows)
	if err != nil {
		return err
	}
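	// Finalize the pending stream: no further rows can be appended after this.
	// The buffered rows only become visible in the table once the stream is
	// committed below.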
	_, err = ms.Finalize(ctx)
	if err != nil {
		return err
	}
	req := &storagepb.BatchCommitWriteStreamsRequest{
		Parent:       s.tableName,
		WriteStreams: []string{ms.StreamName()},
	}
	// Commit data atomically.
	resp, err := s.client.BatchCommitWriteStreams(ctx, req)
	if err != nil {
		return err
	}
	if len(resp.StreamErrors) > 0 {
		return errors.Reason("batchCommitWriteStreams error %s", resp.StreamErrors).Err()
	}
	return nil
}

// batchAppendRows chunks rows into batches and appends each batch to the
// provided managed stream.
func (s *Writer) batchAppendRows(ctx context.Context, ms *managedwriter.ManagedStream, rows []proto.Message) error {
	batches, err := batch(rows)
	if err != nil {
		return errors.Annotate(err, "batching rows").Err()
	}
	results := make([]*managedwriter.AppendResult, 0, len(batches))
	for _, batch := range batches {
		encoded := make([][]byte, 0, len(batch))
		for _, r := range batch {
			b, err := proto.Marshal(r)
			if err != nil {
				return errors.Annotate(err, "marshal proto").Err()
			}
			encoded = append(encoded, b)
		}
		result, err := ms.AppendRows(ctx, encoded)
		if err != nil {
			return errors.Annotate(err, "start appending rows").Err()
		}
		// Defer waiting on AppendRows until after all batches are sent out.
		// https://cloud.google.com/bigquery/docs/write-api-best-practices#do_not_block_on_appendrows_calls
		results = append(results, result)
	}
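	// All batches are now in flight; block on each result so any append error
	// is surfaced to the caller.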
	for _, result := range results {
		_, err := result.GetResult(ctx)
		if err != nil {
			return errors.Annotate(err, "appending rows").Err()
		}
	}
	return nil
}

// batch divides the rows to be inserted into batches, with each
// batch having an on-the-wire size not exceeding batchMaxBytes.
func batch(rows []proto.Message) ([][]proto.Message, error) {
	var result [][]proto.Message

	batchStartIndex := 0
	batchSizeInBytes := 0
	for i, row := range rows {
		// Assume around 16 bytes of per-row overhead not captured by proto.Size.
		rowSize := proto.Size(row) + 16
		if (batchSizeInBytes + rowSize) > batchMaxBytes {
			if rowSize > batchMaxBytes {
				return nil, errors.Reason("a single row exceeds the maximum BigQuery AppendRows request size of %v bytes", batchMaxBytes).Err()
			}
			// Output the batch from batchStartIndex (inclusive) to i (exclusive).
			result = append(result, rows[batchStartIndex:i])

			// The current row becomes part of the next batch.
			batchStartIndex = i
			batchSizeInBytes = 0
		}
		batchSizeInBytes += rowSize
	}
	lastBatch := rows[batchStartIndex:]
	if len(lastBatch) > 0 {
		result = append(result, lastBatch)
	}
	return result, nil
}
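
// Illustration of batch's behavior (hypothetical numbers): with batchMaxBytes
// of 9,000,000 and rows that each marshal to roughly 1 MB, batch returns groups
// of 8 rows, since a 9th row would push the running total over the limit.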