go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/bqutil/storagewrite.go

// Copyright 2024 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bqutil

import (
	"context"
	"time"

	"cloud.google.com/go/bigquery/storage/managedwriter"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/api/option"
	"google.golang.org/grpc"
	"google.golang.org/grpc/keepalive"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/descriptorpb"

	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/grpc/grpcmon"
	"go.chromium.org/luci/server/auth"

	bqpb "go.chromium.org/luci/resultdb/proto/bq"
)

// RowMaxBytes is the maximum number of row bytes to send in one
// BigQuery Storage Write API AppendRows request. As of writing, the
// request size limit for this RPC is 10 MB:
// https://cloud.google.com/bigquery/quotas#write-api-limits.
// The total size of rows must be kept below this limit, as each
// request carries some additional overhead.
const RowMaxBytes = 9 * 1024 * 1024 // 9 MB

// InvalidRowTagKey tags errors caused by rows that can never be exported,
// e.g. rows that fail to marshal or exceed the maximum request size.
var InvalidRowTagKey = errors.NewTagKey("InvalidRow")

// NewWriterClient returns a new BigQuery managedwriter client for use with
// the given GCP project. The client authenticates as ResultDB itself.
func NewWriterClient(ctx context.Context, gcpProject string) (*managedwriter.Client, error) {
	// Create a shared client for all writes.
	// This ensures a shared connection pool is used for all writes,
	// as recommended by:
	// https://cloud.google.com/bigquery/docs/write-api-best-practices#limit_the_number_of_concurrent_connections
	creds, err := auth.GetPerRPCCredentials(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	if err != nil {
		return nil, errors.Annotate(err, "failed to initialize credentials").Err()
	}
	return managedwriter.NewClient(ctx, gcpProject,
		option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})),
		option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithPerRPCCredentials(creds)),
		option.WithGRPCDialOption(grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time: time.Minute,
		})))
}

// Writer is used to export rows to a BigQuery table.
type Writer struct {
	client                *managedwriter.Client
	tableName             string
	tableSchemaDescriptor *descriptorpb.DescriptorProto
}

// NewWriter creates a writer for exporting rows to the provided BigQuery
// table via the provided managedwriter client.
func NewWriter(
	client *managedwriter.Client,
	tableName string,
	tableSchemaDescriptor *descriptorpb.DescriptorProto,
) *Writer {
	return &Writer{
		client:                client,
		tableName:             tableName,
		tableSchemaDescriptor: tableSchemaDescriptor,
	}
}

// AppendRowsWithDefaultStream writes rows to the default stream. This provides
// at-least-once semantics (instead of exactly-once).
func (s *Writer) AppendRowsWithDefaultStream(ctx context.Context, rows []proto.Message) error {
	ms, err := s.client.NewManagedStream(ctx,
		managedwriter.WithType(managedwriter.DefaultStream),
		managedwriter.WithSchemaDescriptor(s.tableSchemaDescriptor),
		managedwriter.WithDestinationTable(s.tableName))
	if err != nil {
		return err
	}
	defer ms.Close()

	return s.batchAppendRows(ctx, ms, rows)
}

// batchAppendRows chunks rows into batches and appends each batch to the
// provided managedStream.
func (s *Writer) batchAppendRows(ctx context.Context, ms *managedwriter.ManagedStream, rows []proto.Message) error {
	batches, err := batch(rows)
	if err != nil {
		return errors.Annotate(err, "batching rows").Tag(errors.BoolTag{Key: InvalidRowTagKey}).Err()
	}
	results := make([]*managedwriter.AppendResult, 0, len(batches))
	for _, batch := range batches {
		encoded := make([][]byte, 0, len(batch))
		for _, r := range batch {
			b, err := proto.Marshal(r)
			if err != nil {
				// Some artifact rows fail to marshal for reasons not yet understood;
				// logging the identifiers gives more info.
				// TODO (nqmtuan): Remove this log when we find out the reason.
				if artifactRow, ok := r.(*bqpb.TextArtifactRow); ok {
					logging.Errorf(ctx, "Marshal failed for artifact row. Inv ID: %s. Test ID: %s. Artifact ID: %s. Shard ID: %d.", artifactRow.InvocationId, artifactRow.TestId, artifactRow.ArtifactId, artifactRow.ShardId)
				}
				return errors.Annotate(err, "marshal proto").Tag(errors.BoolTag{Key: InvalidRowTagKey}).Err()
			}
			encoded = append(encoded, b)
		}
		result, err := ms.AppendRows(ctx, encoded)
		if err != nil {
			return errors.Annotate(err, "start appending rows").Err()
		}
		// Defer waiting on AppendRows results until after all batches are sent out.
		// https://cloud.google.com/bigquery/docs/write-api-best-practices#do_not_block_on_appendrows_calls
		results = append(results, result)
	}
	for _, result := range results {
		_, err := result.GetResult(ctx)
		if err != nil {
			return errors.Annotate(err, "appending rows").Err()
		}
	}
	return nil
}

// batch divides the rows to be inserted into batches, with each
// batch having an on-the-wire size not exceeding RowMaxBytes.
func batch(rows []proto.Message) ([][]proto.Message, error) {
	var result [][]proto.Message

	batchStartIndex := 0
	batchSizeInBytes := 0
	for i, row := range rows {
		// Assume 16 bytes of overhead per row not captured here.
		rowSize := proto.Size(row) + 16
		if (batchSizeInBytes + rowSize) > RowMaxBytes {
			if rowSize > RowMaxBytes {
				return nil, errors.Reason("a single row exceeds the maximum BigQuery AppendRows request size of %v bytes", RowMaxBytes).Err()
			}
			// Output batch from batchStartIndex (inclusive) to i (exclusive).
			result = append(result, rows[batchStartIndex:i])

			// The current row becomes part of the next batch.
			batchStartIndex = i
			batchSizeInBytes = 0
		}
		batchSizeInBytes += rowSize
	}
	lastBatch := rows[batchStartIndex:]
	if len(lastBatch) > 0 {
		result = append(result, lastBatch)
	}
	return result, nil
}
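
// exampleExportRows is an illustrative sketch of how the helpers above are
// typically wired together; it is not used by any production code. The
// function name, project ID and table path are hypothetical placeholders,
// and the rows are assumed to match the supplied schema descriptor.
func exampleExportRows(ctx context.Context, descriptor *descriptorpb.DescriptorProto, rows []proto.Message) error {
	// One shared client (and hence one shared connection pool) per export
	// process, per the best-practices link in NewWriterClient.
	client, err := NewWriterClient(ctx, "my-gcp-project")
	if err != nil {
		return err
	}
	defer client.Close()

	// A Writer targets a single destination table.
	w := NewWriter(client, "projects/my-gcp-project/datasets/my_dataset/tables/my_table", descriptor)

	// Rows are chunked into batches of at most RowMaxBytes and written via
	// the default stream with at-least-once semantics.
	return w.AppendRowsWithDefaultStream(ctx, rows)
}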