github.com/distbuild/reclient@v0.0.0-20240401075343-3de72e395564/cmd/bigquery/main.go (about)

     1  // Copyright 2023 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Binary bigquery is used to stream a reproxy_log generated from a build
    16  // using re-client to bigquery so that it can be further queried upon.
    17  //
    18  // Example invocation (assuming the bigquery table already exists):
    19  //
    20  //	bazelisk run //cmd/bigquery:bigquery -- \
    21  //	  --log_path text:///tmp/reproxy_log.txt \
    22  //	  --alsologtostderr=true \
    23  //	  --table <bigquery-dataset-id>.<bigquery-table-id> \
    24  //	  --project_id <gcp-project-id> # (ex:"foundry-x-experiments")
    25  //
    26  // If you don't have a bigquery table yet, you can create it using the following steps:
    27  //  1. Run scripts/gen_reproxy_log_big_query_schema.sh
    28  //  2. Run the following command using "bq" tool to create table:
    29  //     bq mk --table \
    30  //     --expiration 600 \ # in seconds. This argument is optional and the table doesn't expire if you don't set it.
    31  //     foundry-x-experiments:reproxylogs.reproxy_log_1 \ # Format: <project-id>:<dataset-id>.<table-id>
    32  //     `pwd`/reproxy_log_bigquery_schema/proxy/reproxy_log.schema
    33  //
    34  // Note: It can take up to 5 minutes for the bigquery table to become active
    35  // after it is created.
    36  package main
    37  
    38  import (
    39  	"context"
    40  	"flag"
    41  	"fmt"
    42  	"sync"
    43  	"time"
    44  
    45  	"github.com/bazelbuild/reclient/internal/pkg/bigquery"
    46  	"github.com/bazelbuild/reclient/internal/pkg/bigquerytranslator"
    47  	"github.com/bazelbuild/reclient/internal/pkg/logger"
    48  	"github.com/bazelbuild/reclient/internal/pkg/rbeflag"
    49  
    50  	lpb "github.com/bazelbuild/reclient/api/log"
    51  
    52  	log "github.com/golang/glog"
    53  	"golang.org/x/sync/errgroup"
    54  )
    55  
    56  var (
    57  	// TODO: support --proxy_log_dir.
    58  	logPath          = flag.String("log_path", "", "If provided, the path to a log file of all executed records. The format is e.g. text:///full/file/path.")
    59  	projectID        = flag.String("project_id", "foundry-x-experiments", "The project containing the big query table to which log records should be streamed to.")
    60  	tableSpec        = flag.String("table", "reproxy_log_test.test_1", "Resource specifier of the BigQuery to which log records should be streamed to. If the project is not provided in the specifier project_id will be used.")
    61  	numConcurrentOps = flag.Int("num_concurrent_uploads", 100, "Number of concurrent upload operations to perform.")
    62  )
    63  
    64  func insertRows(logs []*lpb.LogRecord) error {
    65  	ctx := context.Background()
    66  	inserter, cleanup, err := bigquery.NewInserter(ctx, *tableSpec, *projectID, nil)
    67  	defer cleanup()
    68  	if err != nil {
    69  		return fmt.Errorf("bigquery.NewInserter: %v", err)
    70  	}
    71  
    72  	items := make(chan *bigquerytranslator.Item, *numConcurrentOps)
    73  
    74  	g, _ := errgroup.WithContext(context.Background())
    75  
    76  	var processed int32
    77  	var processedMu sync.Mutex
    78  
    79  	for i := 0; i < *numConcurrentOps; i++ {
    80  		g.Go(func() error {
    81  			for item := range items {
    82  				if err := inserter.Put(ctx, item); err != nil {
    83  					// In case of error (gpaste/6313679673360384), retrying the job with
    84  					// back-off as described in BigQuery Service Level Agreement
    85  					// https://cloud.google.com/bigquery/sla
    86  					time.Sleep(1 * time.Second)
    87  					if err := inserter.Put(ctx, item); err != nil {
    88  						log.Errorf("Failed to insert record after retry: %v", err)
    89  						return err
    90  					}
    91  				}
    92  				processedMu.Lock()
    93  				processed++
    94  				processedMu.Unlock()
    95  			}
    96  			return nil
    97  		})
    98  	}
    99  	go func() {
   100  		for range time.Tick(5 * time.Second) {
   101  			processedMu.Lock()
   102  			log.Infof("Finished %v/%v items...", int(processed), len(logs))
   103  			processedMu.Unlock()
   104  		}
   105  	}()
   106  	if len(logs) < 1 {
   107  		log.Infof("No items to load to bigquery.")
   108  		return nil
   109  	}
   110  	log.Infof("Total number of items: %v", len(logs))
   111  	for _, r := range logs {
   112  		items <- &bigquerytranslator.Item{r}
   113  	}
   114  	close(items)
   115  
   116  	if err := g.Wait(); err != nil {
   117  		log.Errorf("Error while uploading to bigquery: %v", err)
   118  	}
   119  	return nil
   120  }
   121  
   122  func main() {
   123  	defer log.Flush()
   124  	rbeflag.Parse()
   125  	if *logPath == "" {
   126  		log.Fatal("Must provide proxy log path.")
   127  	}
   128  
   129  	log.Infof("Loading stats from %v...", *logPath)
   130  	logRecords, err := logger.ParseFromFormatFile(*logPath)
   131  	if err != nil {
   132  		log.Fatalf("Failed reading proxy log: %v", err)
   133  	}
   134  
   135  	log.Infof("Inserting stats into bigquery table...")
   136  	if err := insertRows(logRecords); err != nil {
   137  		log.Fatalf("Unable to insert records into bigquery table: %+v", err)
   138  	}
   139  }