go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/internal/services/bqexporter/text_artifact_row.go

// Copyright 2021 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bqexporter

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"strings"

	"cloud.google.com/go/bigquery"
	"github.com/golang/protobuf/descriptor"
	desc "github.com/golang/protobuf/protoc-gen-go/descriptor"
	"golang.org/x/sync/errgroup"
	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/bq"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/server/span"

	"go.chromium.org/luci/resultdb/bqutil"
	"go.chromium.org/luci/resultdb/internal/artifactcontent"
	"go.chromium.org/luci/resultdb/internal/artifacts"
	"go.chromium.org/luci/resultdb/internal/invocations"
	"go.chromium.org/luci/resultdb/internal/invocations/graph"
	"go.chromium.org/luci/resultdb/pbutil"
	bqpb "go.chromium.org/luci/resultdb/proto/bq"
	pb "go.chromium.org/luci/resultdb/proto/v1"
)

var textArtifactRowSchema bigquery.Schema

const (
	artifactRowMessage = "luci.resultdb.bq.TextArtifactRowLegacy"

	// Row size limit is 5MB according to
	// https://cloud.google.com/bigquery/quotas#streaming_inserts
	// Split artifact content into 4MB shards if it's too large.
	contentShardSize = 4e6

	// Number of workers to download artifact content.
	artifactWorkers = 10
)

func init() {
	var err error
	if textArtifactRowSchema, err = generateArtifactRowSchema(); err != nil {
		panic(err)
	}
}

func generateArtifactRowSchema() (schema bigquery.Schema, err error) {
	fd, _ := descriptor.MessageDescriptorProto(&bqpb.TextArtifactRowLegacy{})
	fdinv, _ := descriptor.MessageDescriptorProto(&bqpb.InvocationRecord{})
	fdsp, _ := descriptor.MessageDescriptorProto(&pb.StringPair{})
	fdset := &desc.FileDescriptorSet{File: []*desc.FileDescriptorProto{fd, fdinv, fdsp}}
	return bqutil.GenerateSchema(fdset, artifactRowMessage)
}
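
// dumpArtifactRowSchema is an illustrative sketch, not used by the exporter:
// it walks a proto-derived schema such as textArtifactRowSchema and prints
// each field with its BigQuery type, which can be handy when checking by eye
// that the generated schema matches the deployed table. It relies only on
// fmt and on the exported bigquery.FieldSchema fields.
func dumpArtifactRowSchema(schema bigquery.Schema, indent string) {
	for _, f := range schema {
		fmt.Printf("%s%s: %s (repeated=%t, required=%t)\n", indent, f.Name, f.Type, f.Repeated, f.Required)
		if len(f.Schema) > 0 {
			// RECORD fields (e.g. the exported/parent invocation records)
			// carry a nested schema; recurse to print it.
			dumpArtifactRowSchema(f.Schema, indent+"  ")
		}
	}
}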

// textArtifactRowInput is information required to generate a text artifact BigQuery row.
type textArtifactRowInput struct {
	exported *pb.Invocation
	parent   *pb.Invocation
	a        *pb.Artifact
	shardID  int32
	content  string
}

func (i *textArtifactRowInput) row() proto.Message {
	_, testID, resultID, artifactID := artifacts.MustParseName(i.a.Name)
	expRec := invocationProtoToRecord(i.exported)
	parRec := invocationProtoToRecord(i.parent)

	return &bqpb.TextArtifactRowLegacy{
		Exported:      expRec,
		Parent:        parRec,
		TestId:        testID,
		ResultId:      resultID,
		ArtifactId:    artifactID,
		ShardId:       i.shardID,
		Content:       i.content,
		PartitionTime: i.exported.CreateTime,
	}
}

func (i *textArtifactRowInput) id() []byte {
	return []byte(fmt.Sprintf("%s/%d", i.a.Name, i.shardID))
}
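
// shardInsertIDs is an illustrative sketch, not used by the exporter: it
// shows how each shard of one (hypothetical) artifact gets its own id(),
// which the surrounding exporter presumably feeds to BigQuery as the insert
// ID for best-effort deduplication. Retries of the same shard therefore
// reuse one ID, while distinct shards of the same artifact never collide.
func shardInsertIDs(a *pb.Artifact, numShards int32) [][]byte {
	ids := make([][]byte, 0, numShards)
	for s := int32(0); s < numShards; s++ {
		in := &textArtifactRowInput{a: a, shardID: s}
		ids = append(ids, in.id())
	}
	return ids
}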

func (b *bqExporter) downloadArtifactContent(ctx context.Context, a *artifact, rowC chan rowInput) error {
	ac := artifactcontent.Reader{
		RBEInstance: b.Options.ArtifactRBEInstance,
		Hash:        a.RBECASHash,
		Size:        a.SizeBytes,
	}

	var str strings.Builder
	shardId := 0
	input := func() *textArtifactRowInput {
		return &textArtifactRowInput{
			exported: a.exported,
			parent:   a.parent,
			a:        a.Artifact.Artifact,
			shardID:  int32(shardId),
			content:  str.String(),
		}
	}

	err := ac.DownloadRBECASContent(ctx, b.rbecasClient, func(ctx context.Context, pr io.Reader) error {
		sc := bufio.NewScanner(pr)
		sc.Buffer(nil, b.maxTokenSize)

		// Return one line at a time; if no newline is buffered yet (a very long
		// line, or a read that ended mid-line), return the buffered data as it is.
		sc.Split(func(data []byte, atEOF bool) (advance int, token []byte, err error) {
			if len(data) == 0 {
				return 0, nil, nil
			}
			if i := bytes.IndexByte(data, '\n'); i >= 0 {
				// We have a full newline-terminated line.
				return i + 1, data[:i+1], nil
			}
			// No newline in the buffered data; return it as is.
			return len(data), data, nil
		})

		for sc.Scan() {
			if str.Len()+len(sc.Bytes()) > contentShardSize {
				select {
				case <-ctx.Done():
					return ctx.Err()
				case rowC <- input():
				}
				shardId++
				str.Reset()
			}
			str.Write(sc.Bytes())
		}
		if err := sc.Err(); err != nil {
			return err
		}

		if str.Len() > 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case rowC <- input():
			}
		}
		return nil
	})
	return errors.Annotate(err, "read artifact content").Err()
}

type artifact struct {
	*artifacts.Artifact
	exported *pb.Invocation
	parent   *pb.Invocation
}

func (b *bqExporter) queryTextArtifacts(ctx context.Context, exportedID invocations.ID, bqExport *pb.BigQueryExport, artifactC chan *artifact) error {
	exportedInv, err := invocations.Read(ctx, exportedID)
	if err != nil {
		return errors.Annotate(err, "error reading exported invocation").Err()
	}
	if exportedInv.State != pb.Invocation_FINALIZED {
		return errors.Reason("%s is not finalized yet", exportedID.Name()).Err()
	}

	invs, err := graph.Reachable(ctx, invocations.NewIDSet(exportedID))
	if err != nil {
		return errors.Annotate(err, "querying reachable invocations").Err()
	}
	for _, batch := range invs.Batches() {
		contentTypeRegexp := bqExport.GetTextArtifacts().GetPredicate().GetContentTypeRegexp()
		if contentTypeRegexp == "" {
			contentTypeRegexp = "text/.*"
		}
		batchInvocations, err := batch.IDSet()
		if err != nil {
			return err
		}
		q := artifacts.Query{
			InvocationIDs:       batchInvocations,
			TestResultPredicate: bqExport.GetTextArtifacts().GetPredicate().GetTestResultPredicate(),
			ContentTypeRegexp:   contentTypeRegexp,
			ArtifactIDRegexp:    bqExport.GetTextArtifacts().GetPredicate().GetArtifactIdRegexp(),
			WithRBECASHash:      true,
		}

		invs, err := invocations.ReadBatch(ctx, q.InvocationIDs)
		if err != nil {
			return err
		}

		err = q.Run(ctx, func(a *artifacts.Artifact) error {
			invID, _, _, _ := artifacts.MustParseName(a.Name)
			select {
			case <-ctx.Done():
				return ctx.Err()
			case artifactC <- &artifact{Artifact: a, exported: exportedInv, parent: invs[invID]}:
			}
			return nil
		})
		if err != nil {
			return errors.Annotate(err, "exporting batch").Err()
		}
	}
	return nil
}

func (b *bqExporter) artifactRowInputToBatch(ctx context.Context, rowC chan rowInput, batchC chan []rowInput) error {
	rows := make([]rowInput, 0, b.MaxBatchRowCount)
	batchSize := 0 // Estimated size of rows in bytes.
	for row := range rowC {
		contentLength := len(row.(*textArtifactRowInput).content)
		if len(rows)+1 >= b.MaxBatchRowCount || batchSize+contentLength >= b.MaxBatchSizeApprox {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case batchC <- rows:
			}
			rows = make([]rowInput, 0, b.MaxBatchRowCount)
			batchSize = 0
		}
		rows = append(rows, row)
		batchSize += contentLength
	}
	if len(rows) > 0 {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case batchC <- rows:
		}
	}
	return nil
}
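
// scanWithLineChunks is an illustrative sketch, not used by the exporter: it
// applies the same split strategy as downloadArtifactContent to an arbitrary
// reader so the behaviour can be exercised in isolation. Tokens are
// newline-terminated lines when a newline is already buffered; otherwise the
// buffered data is emitted as-is, so scanning keeps making progress on
// arbitrarily long lines instead of failing with bufio.ErrTooLong.
func scanWithLineChunks(r io.Reader, maxTokenSize int) ([]string, error) {
	sc := bufio.NewScanner(r)
	sc.Buffer(nil, maxTokenSize)
	sc.Split(func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if len(data) == 0 {
			return 0, nil, nil
		}
		if i := bytes.IndexByte(data, '\n'); i >= 0 {
			// Emit one full newline-terminated line.
			return i + 1, data[:i+1], nil
		}
		// No newline buffered: emit the partial data rather than waiting for more.
		return len(data), data, nil
	})
	var tokens []string
	for sc.Scan() {
		tokens = append(tokens, sc.Text())
	}
	return tokens, sc.Err()
}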

// exportTextArtifactsToBigQuery queries text artifacts in Spanner then exports them to BigQuery.
func (b *bqExporter) exportTextArtifactsToBigQuery(ctx context.Context, ins inserter, invID invocations.ID, bqExport *pb.BigQueryExport) error {
	ctx, cancel := span.ReadOnlyTransaction(ctx)
	defer cancel()

	// Query artifacts and export to BigQuery.
	batchC := make(chan []rowInput)
	rowC := make(chan rowInput)
	artifactC := make(chan *artifact, artifactWorkers)

	// Batch exports rows to BigQuery.
	eg, ctx := errgroup.WithContext(ctx)

	eg.Go(func() error {
		return b.batchExportRows(ctx, ins, batchC, func(ctx context.Context, err bigquery.PutMultiError, rows []*bq.Row) {
			// Print up to 10 errors.
			for i := 0; i < 10 && i < len(err); i++ {
				a := rows[err[i].RowIndex].Message.(*bqpb.TextArtifactRowLegacy)
				var artifactName string
				if a.TestId != "" {
					artifactName = pbutil.TestResultArtifactName(a.Parent.Id, a.TestId, a.ResultId, a.ArtifactId)
				} else {
					artifactName = pbutil.InvocationArtifactName(a.Parent.Id, a.ArtifactId)
				}
				logging.Errorf(ctx, "failed to insert row for %s: %s", artifactName, err[i].Error())
			}
			if len(err) > 10 {
				logging.Errorf(ctx, "%d more row insertions failed", len(err)-10)
			}
		})
	})

	eg.Go(func() error {
		defer close(batchC)
		return errors.Annotate(b.artifactRowInputToBatch(ctx, rowC, batchC), "artifact row input to batch").Err()
	})

	eg.Go(func() error {
		defer close(rowC)

		subEg, ctx := errgroup.WithContext(ctx)
		for w := 0; w < artifactWorkers; w++ {
			subEg.Go(func() error {
				for a := range artifactC {
					if err := b.downloadArtifactContent(ctx, a, rowC); err != nil {
						return err
					}
				}
				return nil
			})
		}
		return subEg.Wait()
	})

	eg.Go(func() error {
		defer close(artifactC)
		return b.queryTextArtifacts(ctx, invID, bqExport, artifactC)
	})

	return eg.Wait()
}
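
// pipelineSketch is an illustrative sketch, not part of the exporter: it
// mirrors, in miniature, the channel discipline used by
// exportTextArtifactsToBigQuery above. The producer owns and closes its
// output channel, the fan-out stage closes the downstream channel only after
// all of its workers return, and errgroup.WithContext cancels every stage as
// soon as one fails, which is why each blocking send also selects on
// ctx.Done(). All names here (itemC, resultC, the worker count of 4) are
// hypothetical.
func pipelineSketch(ctx context.Context, n int) error {
	eg, ctx := errgroup.WithContext(ctx)
	itemC := make(chan int)
	resultC := make(chan string)

	// Producer: owns itemC and closes it when done.
	eg.Go(func() error {
		defer close(itemC)
		for i := 0; i < n; i++ {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case itemC <- i:
			}
		}
		return nil
	})

	// Fan-out workers: close resultC only after every worker has returned.
	eg.Go(func() error {
		defer close(resultC)
		subEg, ctx := errgroup.WithContext(ctx)
		for w := 0; w < 4; w++ {
			subEg.Go(func() error {
				for i := range itemC {
					select {
					case <-ctx.Done():
						return ctx.Err()
					case resultC <- fmt.Sprintf("item-%d", i):
					}
				}
				return nil
			})
		}
		return subEg.Wait()
	})

	// Consumer: drains resultC until it is closed.
	eg.Go(func() error {
		for range resultC {
		}
		return nil
	})

	return eg.Wait()
}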