go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/util/bqutil/storagewrite.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bqutil

import (
	"context"
	"fmt"
	"time"

	"cloud.google.com/go/bigquery/storage/apiv1/storagepb"
	"cloud.google.com/go/bigquery/storage/managedwriter"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/api/option"
	"google.golang.org/grpc"
	"google.golang.org/grpc/keepalive"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/descriptorpb"

	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/grpc/grpcmon"
	"go.chromium.org/luci/server/auth"
)

// batchMaxBytes is the maximum number of row bytes to send in one
// BigQuery Storage Write API - AppendRows request. As of writing, the
// request size limit for this RPC is 10 MB:
// https://cloud.google.com/bigquery/quotas#write-api-limits.
// The total size of rows in a request must stay below this limit, as
// each request carries some additional overhead.
const batchMaxBytes = 9 * 1000 * 1000 // 9 MB

// NewWriterClient returns a new BigQuery managedwriter client for use with the
// given GCP project, which authenticates as LUCI Bisection itself.
func NewWriterClient(ctx context.Context, gcpProject string) (*managedwriter.Client, error) {
	// Create a shared client for all writes.
	// This ensures a shared connection pool is used for all writes,
	// as recommended by:
	// https://cloud.google.com/bigquery/docs/write-api-best-practices#limit_the_number_of_concurrent_connections
	creds, err := auth.GetPerRPCCredentials(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	if err != nil {
		return nil, errors.Annotate(err, "failed to initialize credentials").Err()
	}
	return managedwriter.NewClient(ctx, gcpProject,
		option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})),
		option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithPerRPCCredentials(creds)),
		option.WithGRPCDialOption(grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time: time.Minute,
		})))
}

// Writer is used to export rows to a BigQuery table.
type Writer struct {
	client                *managedwriter.Client
	tableName             string
	tableSchemaDescriptor *descriptorpb.DescriptorProto
}

// NewWriter creates a writer for exporting rows to the provided BigQuery table
// via the provided managedwriter client.
func NewWriter(
	client *managedwriter.Client,
	tableName string,
	tableSchemaDescriptor *descriptorpb.DescriptorProto,
) *Writer {
	return &Writer{
		client:                client,
		tableName:             tableName,
		tableSchemaDescriptor: tableSchemaDescriptor,
	}
}

// AppendRowsWithPendingStream appends rows to the BigQuery table via a pending
// stream. This provides all-or-nothing semantics for the insertion.
func (s *Writer) AppendRowsWithPendingStream(ctx context.Context, rows []proto.Message) error {
	ms, err := s.client.NewManagedStream(ctx,
		managedwriter.WithType(managedwriter.PendingStream),
		managedwriter.WithSchemaDescriptor(s.tableSchemaDescriptor),
		managedwriter.WithDestinationTable(s.tableName))
	if err != nil {
		return err
	}
	defer ms.Close()

	err = s.batchAppendRows(ctx, ms, rows)
	if err != nil {
		return err
	}
	_, err = ms.Finalize(ctx)
	if err != nil {
		return err
	}
	req := &storagepb.BatchCommitWriteStreamsRequest{
		Parent:       s.tableName,
		WriteStreams: []string{ms.StreamName()},
	}
	// Commit data atomically.
	resp, err := s.client.BatchCommitWriteStreams(ctx, req)
	if err != nil {
		return err
	}
	if len(resp.StreamErrors) > 0 {
		return errors.New(fmt.Sprintf("batchCommitWriteStreams error %s", resp.StreamErrors))
	}
	return nil
}

// batchAppendRows chunks rows into batches and appends each batch to the
// provided managed stream.
func (s *Writer) batchAppendRows(ctx context.Context, ms *managedwriter.ManagedStream, rows []proto.Message) error {
	batches, err := batch(rows)
	if err != nil {
		return errors.Annotate(err, "batching rows").Err()
	}
	results := make([]*managedwriter.AppendResult, 0, len(batches))
	for _, batch := range batches {
		encoded := make([][]byte, 0, len(batch))
		for _, r := range batch {
			b, err := proto.Marshal(r)
			if err != nil {
				return errors.Annotate(err, "marshal proto").Err()
			}
			encoded = append(encoded, b)
		}
		result, err := ms.AppendRows(ctx, encoded)
		if err != nil {
			return errors.Annotate(err, "start appending rows").Err()
		}
		// Defer waiting on the AppendRows results until all batches have been
		// sent out:
		// https://cloud.google.com/bigquery/docs/write-api-best-practices#do_not_block_on_appendrows_calls
		results = append(results, result)
	}
	for _, result := range results {
		_, err := result.GetResult(ctx)
		if err != nil {
			return errors.Annotate(err, "appending rows").Err()
		}
	}
	return nil
}

// batch divides the rows to be inserted into batches, with each
// batch having an on-the-wire size not exceeding batchMaxBytes.
func batch(rows []proto.Message) ([][]proto.Message, error) {
	var result [][]proto.Message

	batchStartIndex := 0
	batchSizeInBytes := 0
	for i, row := range rows {
		// Assume 16 bytes of overhead per row not captured here.
		rowSize := proto.Size(row) + 16
		if (batchSizeInBytes + rowSize) > batchMaxBytes {
			if rowSize > batchMaxBytes {
				return nil, errors.Reason("a single row exceeds the maximum BigQuery AppendRows request size of %v bytes", batchMaxBytes).Err()
			}
			// Output the batch from batchStartIndex (inclusive) to i (exclusive).
			result = append(result, rows[batchStartIndex:i])

			// The current row becomes part of the next batch.
			batchStartIndex = i
			batchSizeInBytes = 0
		}
		batchSizeInBytes += rowSize
	}
	lastBatch := rows[batchStartIndex:]
	if len(lastBatch) > 0 {
		result = append(result, lastBatch)
	}
	return result, nil
}
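
// The following is a minimal usage sketch showing how the helpers in this
// file are expected to be combined. It is illustrative only: ctx, gcpProject,
// tableName, rowDescriptor and rows are hypothetical placeholders supplied by
// the caller, not values defined in this package. tableName is assumed to be
// a fully-qualified table name of the form
// "projects/<project>/datasets/<dataset>/tables/<table>", as expected by the
// managedwriter destination table option and the BatchCommitWriteStreams
// parent field.
//
//	// Create one shared client per process and reuse it for all writes.
//	client, err := NewWriterClient(ctx, gcpProject)
//	if err != nil {
//		return errors.Annotate(err, "create BigQuery writer client").Err()
//	}
//	defer client.Close()
//
//	// rowDescriptor is the descriptorpb.DescriptorProto describing the row proto.
//	writer := NewWriter(client, tableName, rowDescriptor)
//
//	// rows is a []proto.Message; either all rows are committed or none are.
//	if err := writer.AppendRowsWithPendingStream(ctx, rows); err != nil {
//		return errors.Annotate(err, "export rows to BigQuery").Err()
//	}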