go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/internal/services/bqexporter/text_artifact_row.go

// Copyright 2021 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bqexporter

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"strings"

	"cloud.google.com/go/bigquery"
	"github.com/golang/protobuf/descriptor"
	desc "github.com/golang/protobuf/protoc-gen-go/descriptor"
	"golang.org/x/sync/errgroup"
	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/bq"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/server/span"

	"go.chromium.org/luci/resultdb/bqutil"
	"go.chromium.org/luci/resultdb/internal/artifactcontent"
	"go.chromium.org/luci/resultdb/internal/artifacts"
	"go.chromium.org/luci/resultdb/internal/invocations"
	"go.chromium.org/luci/resultdb/internal/invocations/graph"
	"go.chromium.org/luci/resultdb/pbutil"
	bqpb "go.chromium.org/luci/resultdb/proto/bq"
	pb "go.chromium.org/luci/resultdb/proto/v1"
)

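// textArtifactRowSchema is the BigQuery table schema for TextArtifactRowLegacy
// rows, generated from proto descriptors in init.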
var textArtifactRowSchema bigquery.Schema

const (
	artifactRowMessage = "luci.resultdb.bq.TextArtifactRowLegacy"

	// Row size limit is 5MB according to
	// https://cloud.google.com/bigquery/quotas#streaming_inserts.
	// Split artifact content into 4MB shards if it's too large.
	contentShardSize = 4e6

	// Number of workers to download artifact content.
	artifactWorkers = 10
)

func init() {
	var err error
	if textArtifactRowSchema, err = generateArtifactRowSchema(); err != nil {
		panic(err)
	}
}

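// generateArtifactRowSchema builds the BigQuery schema for
// TextArtifactRowLegacy rows from the proto descriptors of the row message
// and the messages it references.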
func generateArtifactRowSchema() (schema bigquery.Schema, err error) {
	fd, _ := descriptor.MessageDescriptorProto(&bqpb.TextArtifactRowLegacy{})
	fdinv, _ := descriptor.MessageDescriptorProto(&bqpb.InvocationRecord{})
	fdsp, _ := descriptor.MessageDescriptorProto(&pb.StringPair{})
	fdset := &desc.FileDescriptorSet{File: []*desc.FileDescriptorProto{fd, fdinv, fdsp}}
	return bqutil.GenerateSchema(fdset, artifactRowMessage)
}

// textArtifactRowInput is information required to generate a text artifact BigQuery row.
type textArtifactRowInput struct {
	exported *pb.Invocation
	parent   *pb.Invocation
	a        *pb.Artifact
	shardID  int32
	content  string
}

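// row converts the input into a TextArtifactRowLegacy message, with
// PartitionTime set to the exported invocation's creation time.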
func (i *textArtifactRowInput) row() proto.Message {
	_, testID, resultID, artifactID := artifacts.MustParseName(i.a.Name)
	expRec := invocationProtoToRecord(i.exported)
	parRec := invocationProtoToRecord(i.parent)

	return &bqpb.TextArtifactRowLegacy{
		Exported:      expRec,
		Parent:        parRec,
		TestId:        testID,
		ResultId:      resultID,
		ArtifactId:    artifactID,
		ShardId:       i.shardID,
		Content:       i.content,
		PartitionTime: i.exported.CreateTime,
	}
}

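// id returns an identifier for the row, derived from the artifact name and
// the shard ID.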
func (i *textArtifactRowInput) id() []byte {
	return []byte(fmt.Sprintf("%s/%d", i.a.Name, i.shardID))
}

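// downloadArtifactContent streams the artifact content from RBE-CAS, splits
// it into shards of approximately contentShardSize bytes (preferring line
// boundaries), and sends one row input per shard to rowC.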
func (b *bqExporter) downloadArtifactContent(ctx context.Context, a *artifact, rowC chan rowInput) error {
	ac := artifactcontent.Reader{
		RBEInstance: b.Options.ArtifactRBEInstance,
		Hash:        a.RBECASHash,
		Size:        a.SizeBytes,
	}

	var str strings.Builder
	shardID := 0
	input := func() *textArtifactRowInput {
		return &textArtifactRowInput{
			exported: a.exported,
			parent:   a.parent,
			a:        a.Artifact.Artifact,
			shardID:  int32(shardID),
			content:  str.String(),
		}
	}

	err := ac.DownloadRBECASContent(ctx, b.rbecasClient, func(ctx context.Context, pr io.Reader) error {
		sc := bufio.NewScanner(pr)
		sc.Buffer(nil, b.maxTokenSize)

		// Return one line at a time; if a line exceeds the buffer, return the
		// data as-is.
		sc.Split(func(data []byte, atEOF bool) (advance int, token []byte, err error) {
			if len(data) == 0 {
				return 0, nil, nil
			}
			if i := bytes.IndexByte(data, '\n'); i >= 0 {
				// We have a full newline-terminated line.
				return i + 1, data[:i+1], nil
			}
			// A partial line occupies the entire buffer; return it as-is.
			return len(data), data, nil
		})

		for sc.Scan() {
			if str.Len()+len(sc.Bytes()) > contentShardSize {
				select {
				case <-ctx.Done():
					return ctx.Err()
				case rowC <- input():
				}
				shardID++
				str.Reset()
			}
			str.Write(sc.Bytes())
		}
		if err := sc.Err(); err != nil {
			return err
		}

		if str.Len() > 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case rowC <- input():
			}
		}
		return nil
	})
	return errors.Annotate(err, "read artifact content").Err()
}

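// artifact is an artifact read from Spanner, bundled with the exported
// invocation and the invocation that directly contains the artifact.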
type artifact struct {
	*artifacts.Artifact
	exported *pb.Invocation
	parent   *pb.Invocation
}

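// queryTextArtifacts reads text artifacts matching bqExport's predicate from
// all invocations reachable from exportedID and sends them to artifactC.
// The exported invocation must be finalized.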
func (b *bqExporter) queryTextArtifacts(ctx context.Context, exportedID invocations.ID, bqExport *pb.BigQueryExport, artifactC chan *artifact) error {
	exportedInv, err := invocations.Read(ctx, exportedID)
	if err != nil {
		return errors.Annotate(err, "error reading exported invocation").Err()
	}
	if exportedInv.State != pb.Invocation_FINALIZED {
		return errors.Reason("%s is not finalized yet", exportedID.Name()).Err()
	}

	invs, err := graph.Reachable(ctx, invocations.NewIDSet(exportedID))
	if err != nil {
		return errors.Annotate(err, "querying reachable invocations").Err()
	}

	contentTypeRegexp := bqExport.GetTextArtifacts().GetPredicate().GetContentTypeRegexp()
	if contentTypeRegexp == "" {
		contentTypeRegexp = "text/.*"
	}

	for _, batch := range invs.Batches() {
		batchInvocations, err := batch.IDSet()
		if err != nil {
			return err
		}
		q := artifacts.Query{
			InvocationIDs:       batchInvocations,
			TestResultPredicate: bqExport.GetTextArtifacts().GetPredicate().GetTestResultPredicate(),
			ContentTypeRegexp:   contentTypeRegexp,
			ArtifactIDRegexp:    bqExport.GetTextArtifacts().GetPredicate().GetArtifactIdRegexp(),
			WithRBECASHash:      true,
		}

		invsByID, err := invocations.ReadBatch(ctx, q.InvocationIDs)
		if err != nil {
			return err
		}

		err = q.Run(ctx, func(a *artifacts.Artifact) error {
			invID, _, _, _ := artifacts.MustParseName(a.Name)
			select {
			case <-ctx.Done():
				return ctx.Err()
			case artifactC <- &artifact{Artifact: a, exported: exportedInv, parent: invsByID[invID]}:
			}
			return nil
		})
		if err != nil {
			return errors.Annotate(err, "exporting batch").Err()
		}
	}
	return nil
}

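// artifactRowInputToBatch groups row inputs from rowC into batches, bounded
// by MaxBatchRowCount and approximately MaxBatchSizeApprox bytes of content,
// and sends each batch to batchC.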
func (b *bqExporter) artifactRowInputToBatch(ctx context.Context, rowC chan rowInput, batchC chan []rowInput) error {
	rows := make([]rowInput, 0, b.MaxBatchRowCount)
	batchSize := 0 // Estimated size of rows in bytes.
	for row := range rowC {
		contentLength := len(row.(*textArtifactRowInput).content)
		if len(rows)+1 >= b.MaxBatchRowCount || batchSize+contentLength >= b.MaxBatchSizeApprox {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case batchC <- rows:
			}
			rows = make([]rowInput, 0, b.MaxBatchRowCount)
			batchSize = 0
		}
		rows = append(rows, row)
		batchSize += contentLength
	}
	if len(rows) > 0 {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case batchC <- rows:
		}
	}
	return nil
}

// exportTextArtifactsToBigQuery queries text artifacts in Spanner and exports them to BigQuery.
func (b *bqExporter) exportTextArtifactsToBigQuery(ctx context.Context, ins inserter, invID invocations.ID, bqExport *pb.BigQueryExport) error {
	ctx, cancel := span.ReadOnlyTransaction(ctx)
	defer cancel()

	// Query artifacts and export to BigQuery.
	batchC := make(chan []rowInput)
	rowC := make(chan rowInput)
	artifactC := make(chan *artifact, artifactWorkers)

	eg, ctx := errgroup.WithContext(ctx)

	// Export batched rows to BigQuery.
	eg.Go(func() error {
		return b.batchExportRows(ctx, ins, batchC, func(ctx context.Context, err bigquery.PutMultiError, rows []*bq.Row) {
			// Log up to 10 errors.
			for i := 0; i < 10 && i < len(err); i++ {
				a := rows[err[i].RowIndex].Message.(*bqpb.TextArtifactRowLegacy)
				var artifactName string
				if a.TestId != "" {
					artifactName = pbutil.TestResultArtifactName(a.Parent.Id, a.TestId, a.ResultId, a.ArtifactId)
				} else {
					artifactName = pbutil.InvocationArtifactName(a.Parent.Id, a.ArtifactId)
				}
				logging.Errorf(ctx, "failed to insert row for %s: %s", artifactName, err[i].Error())
			}
			if len(err) > 10 {
				logging.Errorf(ctx, "%d more row insertions failed", len(err)-10)
			}
		})
	})

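	// Group row inputs into batches.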
	eg.Go(func() error {
		defer close(batchC)
		return errors.Annotate(b.artifactRowInputToBatch(ctx, rowC, batchC), "artifact row input to batch").Err()
	})

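	// Download artifact content and convert it to row inputs using a pool of workers.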
	eg.Go(func() error {
		defer close(rowC)

		subEg, ctx := errgroup.WithContext(ctx)
		for w := 0; w < artifactWorkers; w++ {
			subEg.Go(func() error {
				for a := range artifactC {
					if err := b.downloadArtifactContent(ctx, a, rowC); err != nil {
						return err
					}
				}
				return nil
			})
		}
		return subEg.Wait()
	})

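	// Query text artifacts from Spanner and feed them to the download workers.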
	eg.Go(func() error {
		defer close(artifactC)
		return b.queryTextArtifacts(ctx, invID, bqExport, artifactC)
	})

	return eg.Wait()
}