go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/appengine/bqlog/bqlog.go

// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package bqlog provides a mechanism to asynchronously log rows to BigQuery.
//
// Deprecated: this package depends on Pull Task Queues which are deprecated
// and not available from the GAE second-gen runtime or from Kubernetes. The
// replacement is go.chromium.org/luci/server/tq PubSub tasks, plus a PubSub
// push subscription with a handler that simply inserts rows into BigQuery.
package bqlog

import (
	"bytes"
	"context"
	"encoding/gob"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"cloud.google.com/go/bigquery"
	bqapi "google.golang.org/api/bigquery/v2"
	"google.golang.org/api/googleapi"

	"go.chromium.org/luci/gae/service/info"
	"go.chromium.org/luci/gae/service/taskqueue"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"

	"go.chromium.org/luci/common/tsmon/distribution"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/types"

	"go.chromium.org/luci/server/auth"
)

const (
	defaultBatchesPerRequest  = 250
	defaultMaxParallelUploads = 64
	defaultFlushTimeout       = time.Minute
)

var (
	// This can be used to estimate how many events are produced.
	insertedEntryCount = metric.NewCounter(
		"luci/bqlog/inserted_entry_count",
		"Total number of log entries successfully added in Insert(...).",
		nil,
		field.String("table")) // "<projID>/<datasetID>/<tableID>"

	// To track the performance of Insert(...).
	insertLatency = metric.NewCumulativeDistribution(
		"luci/bqlog/insert_latency",
		"Distribution of Insert(...) call latencies.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("status")) // "ok" or "fail"

	// To track the performance of 'Flush'.
	flushLatency = metric.NewCumulativeDistribution(
		"luci/bqlog/flush_latency",
		"Distribution of Flush(...) call latencies.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("status")) // "ok", "fail" or "warning"

	// This is perhaps the most important metric, since it shows the number of
	// rows skipped during the flush due to schema mismatch or other BigQuery
	// errors.
	flushedEntryCount = metric.NewCounter(
		"luci/bqlog/flushed_entry_count",
		"Total number of rows sent to BigQuery (including rejected rows).",
		nil,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("status")) // "ok" or whatever error reason BigQuery returns

	// Stats of individual BigQuery API calls (including retries).
	bigQueryLatency = metric.NewCumulativeDistribution(
		"luci/bqlog/bigquery_latency",
		"Distribution of BigQuery API call latencies.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("method"), // name of the API method, e.g. "insertAll"
		field.String("status")) // "ok", "http_400", ..., "timeout" or "unknown"

	// This can be used to estimate a queuing backlog.
	pullQueueLen = metric.NewInt(
		"luci/bqlog/pullqueue_len",
		"Number of tasks in the associated Pull Queue (prior to the Flush call).",
		nil,
		field.String("table")) // "<projID>/<datasetID>/<tableID>"

	// This estimates the queuing delay and any GAE scheduling hiccups.
	pullQueueLatency = metric.NewFloat(
		"luci/bqlog/pullqueue_latency",
		"Age of the oldest task in the queue or 0 if the queue is empty.",
		&types.MetricMetadata{Units: types.Milliseconds},
		field.String("table")) // "<projID>/<datasetID>/<tableID>"
)

// Log can be used to insert entries into a BigQuery table.
type Log struct {
	// QueueName is the name of a pull queue to use as a buffer for inserts.
	//
	// Required. It must be defined in the queue.yaml file and it must not be
	// used by any other Log object.
	QueueName string

	// ProjectID is the Cloud Project that owns the dataset.
	//
	// If empty, it will be derived from the current app ID.
	ProjectID string

	// DatasetID identifies the already existing dataset that contains the table.
	//
	// Required.
	DatasetID string

	// TableID identifies the name of the table in the dataset.
	//
	// Required. The table must exist already.
	TableID string

	// BatchesPerRequest is how many batches of entries to send in one BQ insert.
	//
	// A call to 'Insert' generates one batch of entries, thus BatchesPerRequest
	// essentially specifies how many 'Insert's to clump together when sending
	// data to BigQuery. If your Inserts are known to be huge, lowering this
	// value may help to avoid hitting memory limits.
	//
	// Default is 250. It assumes your batches are very small (1-3 rows), which
	// is usually the case if events are generated by online RPC handlers.
	BatchesPerRequest int

	// MaxParallelUploads is how many parallel ops to do when flushing.
	//
	// We limit it to avoid hitting OOM errors on GAE.
	//
	// Default is 64.
	MaxParallelUploads int

	// FlushTimeout is the maximum duration to spend fetching from the Pull
	// Queue in 'Flush'.
	//
	// We limit it to make sure 'Flush' has a chance to finish running before
	// GAE kills it by deadline. Next time 'Flush' is called, it will resume
	// flushing from where it left off.
	//
	// Note that 'Flush' can run for slightly longer, since it waits for all
	// pulled data to be flushed before returning.
	//
	// Default is 1 min.
	FlushTimeout time.Duration

	// DumpEntriesToLogger makes 'Insert' log all entries (at debug level).
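	//
	// Each row is dumped as indented JSON, which can be handy (together with
	// DryRun) when debugging locally.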
	DumpEntriesToLogger bool

	// DryRun disables the actual uploads (keeps the local logging though).
	DryRun bool

	// insertMock is used to mock the BigQuery insertAll call in tests.
	insertMock func(context.Context, *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error)
	// beforeSendChunk is used in tests to signal that 'sendChunk' is called.
	beforeSendChunk func(context.Context, []*taskqueue.Task)
}

// rawEntry is a single structured entry in the log.
//
// It gets gob-serialized and put into the Task Queue.
type rawEntry struct {
	InsertID string
	Data     map[string]bqapi.JsonValue // more like map[string]json.RawMessage
}

func init() {
	gob.Register(json.RawMessage{})
}

// valuesToJSON converts a bigquery.Value map to the simplest
// JSON-marshallable bqapi.JsonValue map.
//
// bigquery.Value can actually be pretty complex (e.g. time.Time, big.Rat),
// which makes it difficult to put into a gob-encodable rawEntry.
//
// So instead we convert it to JSON before putting it into gob (the bqapi
// library does it later anyway). But since the BigQuery API wants a
// map[string]bqapi.JsonValue, we return map[string]json.RawMessage instead of
// a complete raw []byte.
func valuesToJSON(in map[string]bigquery.Value) (map[string]bqapi.JsonValue, error) {
	if len(in) == 0 {
		return nil, nil
	}
	out := make(map[string]bqapi.JsonValue, len(in))
	for k, v := range in {
		blob, err := json.Marshal(v)
		if err != nil {
			return nil, errors.Annotate(err, "failed to JSON-serialize key %q", k).Err()
		}
		out[k] = json.RawMessage(blob)
	}
	return out, nil
}

// Insert adds a bunch of entries to the buffer of pending entries.
//
// It reuses the existing datastore transaction (if any). This allows logging
// entries transactionally when changing something in the datastore.
//
// Empty insert IDs will be replaced with autogenerated ones (they start with
// 'bqlog:'). Entries not matching the schema are logged and skipped during
// the flush.
func (l *Log) Insert(ctx context.Context, rows ...bigquery.ValueSaver) (err error) {
	if len(rows) == 0 {
		return nil
	}

	entries := make([]rawEntry, len(rows))
	for i, r := range rows {
		values, iid, err := r.Save()
		if err != nil {
			return errors.Annotate(err, "failure when saving row #%d", i).Err()
		}
		if entries[i].Data, err = valuesToJSON(values); err != nil {
			return errors.Annotate(err, "failure when serializing row #%d", i).Err()
		}
		entries[i].InsertID = iid
	}

	if l.DumpEntriesToLogger && logging.IsLogging(ctx, logging.Debug) {
		for idx, entry := range entries {
			blob, err := json.MarshalIndent(entry.Data, "", " ")
			if err != nil {
				logging.WithError(err).Errorf(ctx, "Failed to serialize the row #%d", idx)
			} else {
				logging.Debugf(ctx, "BigQuery row #%d for %s/%s:\n%s", idx, l.DatasetID, l.TableID, blob)
			}
		}
	}

	if l.DryRun {
		return nil
	}

	// We need tableRef to report the metrics, thus an error getting tableRef
	// is NOT reported to tsmon. It happens only if TableID or DatasetID are
	// malformed.
	tableRef, err := l.tableRef(ctx)
	if err != nil {
		return err
	}

	startTime := clock.Now(ctx)
	defer func() {
		dt := clock.Since(ctx, startTime)
		status := "fail"
		if err == nil {
			status = "ok"
			insertedEntryCount.Add(ctx, int64(len(entries)), tableRef)
		}
		insertLatency.Add(ctx, float64(dt.Nanoseconds()/1e6), tableRef, status)
	}()

	buf := bytes.Buffer{}
	if err := gob.NewEncoder(&buf).Encode(entries); err != nil {
		return err
	}
	return taskqueue.Add(ctx, l.QueueName, &taskqueue.Task{
		Method:  "PULL",
		Payload: buf.Bytes(),
	})
}

// Flush pulls buffered rows from the Pull Queue and sends them to BigQuery.
//
// Must be called periodically from some cron job. It is okay to call 'Flush'
// concurrently from multiple processes to speed up the upload.
//
// It succeeds if all entries it attempted to send were successfully handled
// by BigQuery. If some entries are malformed, it logs the error and skips
// them, so they don't get stuck in the pending buffer forever. This
// corresponds to 'skipInvalidRows=true' in the 'insertAll' BigQuery call.
//
// Returns the number of rows sent to BigQuery. May return both a non-zero
// number of rows and an error if something bad happened midway.
func (l *Log) Flush(ctx context.Context) (int, error) {
	tableRef, err := l.tableRef(ctx)
	if err != nil {
		return 0, err
	}
	ctx = logging.SetFields(ctx, logging.Fields{"table": tableRef})
	logging.Infof(ctx, "Flush started")

	startTime := clock.Now(ctx)

	softDeadline := startTime.Add(l.flushTimeout()) // when to stop pulling tasks
	hardDeadline := softDeadline.Add(time.Minute)   // when to abort all calls

	softDeadlineCtx, cancel := clock.WithDeadline(ctx, softDeadline)
	defer cancel()
	hardDeadlineCtx, cancel := clock.WithDeadline(ctx, hardDeadline)
	defer cancel()

	stats, err := taskqueue.Stats(ctx, l.QueueName)
	if err != nil {
		logging.WithError(err).Warningf(ctx, "Failed to query stats of queue %q", l.QueueName)
	} else {
		var age time.Duration
		if eta := stats[0].OldestETA; !eta.IsZero() {
			age = clock.Now(ctx).Sub(eta)
		}
		pullQueueLatency.Set(ctx, float64(age.Nanoseconds()/1e6), tableRef)
		pullQueueLen.Set(ctx, int64(stats[0].Tasks), tableRef)
	}

	// Lease pending upload tasks, split them into 'BatchesPerRequest' chunks,
	// upload all chunks in parallel (limiting the number of concurrent
	// uploads).
	flusher := asyncFlusher{
		Context:  hardDeadlineCtx,
		TableRef: tableRef,
		Insert:   l.insert,
	}
	flusher.start(l.maxParallelUploads())

	// We lease batches until we run out of time or there's nothing more to
	// lease. On errors or RPC deadlines we slow down, but carry on. We lease
	// until the hard deadline. Note that losing a lease is not a catastrophic
	// event: BigQuery still should be able to remove duplicates based on
	// insertID.
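	//
	// For example, with the default settings each iteration leases up to 250
	// tasks with a lease that lasts until the hard deadline, and transient
	// lease errors back off at 1s, 2s, 4s, ... until the next sleep would
	// cross the soft deadline.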
	var lastLeaseErr error
	sleep := time.Second
	for clock.Now(ctx).Before(softDeadline) {
		rpcCtx, cancel := clock.WithTimeout(softDeadlineCtx, 15*time.Second) // RPC timeout
		tasks, err := taskqueue.Lease(rpcCtx, l.batchesPerRequest(), l.QueueName, hardDeadline.Sub(clock.Now(ctx)))
		cancel()
		if err != nil {
			lastLeaseErr = err
			if clock.Now(ctx).Add(sleep).After(softDeadline) {
				logging.Warningf(ctx, "Error while leasing, giving up: %s", err)
				break
			}
			logging.Warningf(ctx, "Error while leasing, sleeping %s: %s", sleep, err)
			clock.Sleep(clock.Tag(softDeadlineCtx, "lease-retry"), sleep)
			sleep *= 2
			continue
		}
		sleep = time.Second
		if len(tasks) == 0 {
			break
		}
		if l.beforeSendChunk != nil {
			l.beforeSendChunk(ctx, tasks)
		}
		flusher.sendChunk(chunk{
			Tasks: tasks,
			Done: func(ctx context.Context) {
				logging.Infof(ctx, "Deleting %d tasks from the task queue", len(tasks))
				ctx, cancel := clock.WithTimeout(ctx, 30*time.Second) // RPC timeout
				defer cancel()
				if err := taskqueue.Delete(ctx, l.QueueName, tasks...); err != nil {
					logging.WithError(err).Errorf(ctx, "Failed to delete some tasks")
				}
			},
		})
	}

	sent, err := flusher.waitAll()
	logging.Infof(ctx, "Flush finished, sent %d rows", sent)

	if err == nil {
		err = lastLeaseErr
	}

	dt := clock.Since(ctx, startTime)
	status := "ok"
	switch {
	case err != nil && sent == 0:
		status = "fail"
	case err != nil && sent != 0:
		status = "warning"
	}
	flushLatency.Add(ctx, float64(dt.Nanoseconds()/1e6), tableRef, status)

	return sent, err
}

func (l *Log) batchesPerRequest() int {
	if l.BatchesPerRequest > 0 {
		return l.BatchesPerRequest
	}
	return defaultBatchesPerRequest
}

func (l *Log) maxParallelUploads() int {
	if l.MaxParallelUploads > 0 {
		return l.MaxParallelUploads
	}
	return defaultMaxParallelUploads
}

func (l *Log) flushTimeout() time.Duration {
	if l.FlushTimeout > 0 {
		return l.FlushTimeout
	}
	return defaultFlushTimeout
}

func (l *Log) insert(ctx context.Context, r *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error) {
	if l.insertMock != nil {
		return l.insertMock(ctx, r)
	}
	return l.doInsert(ctx, r)
}

// projID returns ProjectID or the GAE app ID if ProjectID is "".
func (l *Log) projID(ctx context.Context) string {
	if l.ProjectID == "" {
		return info.TrimmedAppID(ctx)
	}
	return l.ProjectID
}

// tableRef returns an identifier of the table in BigQuery.
//
// Returns an error if the Log is misconfigured.
func (l *Log) tableRef(ctx context.Context) (string, error) {
	projID := l.projID(ctx)
	if projID == "" || strings.ContainsRune(projID, '/') {
		return "", fmt.Errorf("invalid project ID %q", projID)
	}
	if l.DatasetID == "" || strings.ContainsRune(l.DatasetID, '/') {
		return "", fmt.Errorf("invalid dataset ID %q", l.DatasetID)
	}
	if l.TableID == "" || strings.ContainsRune(l.TableID, '/') {
		return "", fmt.Errorf("invalid table ID %q", l.TableID)
	}
	return fmt.Sprintf("%s/%s/%s", projID, l.DatasetID, l.TableID), nil
}

// bigQuery constructs an instance of the BigQuery API client with proper auth.
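//
// It authenticates as the service itself (auth.AsSelf) with the BigQuery
// OAuth scope, so the app's own service account must be allowed to insert
// rows into the table.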
func (l *Log) bigQuery(ctx context.Context) (*bqapi.Service, error) {
	tr, err := auth.GetRPCTransport(ctx, auth.AsSelf, auth.WithScopes(bqapi.BigqueryScope))
	if err != nil {
		return nil, err
	}
	return bqapi.New(&http.Client{Transport: tr})
}

// doInsert does the actual BigQuery call.
//
// It is mocked in tests.
func (l *Log) doInsert(ctx context.Context, req *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error) {
	ctx, cancel := clock.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	logging.Infof(ctx, "Sending %d rows to BigQuery", len(req.Rows))
	bq, err := l.bigQuery(ctx)
	if err != nil {
		return nil, err
	}
	call := bq.Tabledata.InsertAll(l.projID(ctx), l.DatasetID, l.TableID, req)
	return call.Context(ctx).Do()
}

// asyncFlusher implements a parallel flush to BigQuery.
type asyncFlusher struct {
	Context  context.Context // the root context
	TableRef string          // for monitoring metrics
	Insert   func(context.Context, *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error)

	index int32 // incremented in each 'sendChunk' call

	chunks chan chunk
	wg     sync.WaitGroup

	mu       sync.Mutex
	errs     errors.MultiError // collected errors from all 'sendChunk' ops
	rowsSent int               // total number of rows sent to BigQuery
}

// chunk is a bunch of pending batches flushed together.
type chunk struct {
	Tasks []*taskqueue.Task
	Done  func(context.Context) // called in a goroutine after successful upload

	index int32 // used only for logging, see sendChunk
}

// start launches the internal goroutines that upload data.
func (f *asyncFlusher) start(numParallel int) {
	f.chunks = make(chan chunk)
	for i := 0; i < numParallel; i++ {
		f.wg.Add(1)
		go func() {
			defer f.wg.Done()
			f.uploaderLoop()
		}()
	}
}

// waitAll waits for completion of all pending 'sendChunk' calls and stops all
// internal goroutines.
//
// Returns the total number of rows sent and all the errors.
func (f *asyncFlusher) waitAll() (int, error) {
	close(f.chunks)
	f.wg.Wait()
	if len(f.errs) == 0 {
		return f.rowsSent, nil
	}
	return f.rowsSent, f.errs
}

// ctx returns a context to use for logging operations happening to some chunk.
func (f *asyncFlusher) ctx(chunkIndex int32) context.Context {
	return logging.SetField(f.Context, "chunk", chunkIndex)
}

// sendChunk starts an asynchronous operation to upload data to BigQuery.
//
// Can block if too many parallel uploads are already underway. Panics if
// called before 'start' or after 'waitAll'.
//
// On successful upload it deletes the tasks from the Pull Queue.
func (f *asyncFlusher) sendChunk(c chunk) {
	c.index = atomic.AddInt32(&f.index, 1)
	logging.Infof(f.ctx(c.index), "Chunk with %d batches queued", len(c.Tasks))
	f.chunks <- c
}

// uploaderLoop runs in a separate goroutine.
func (f *asyncFlusher) uploaderLoop() {
	for chunk := range f.chunks {
		ctx := f.ctx(chunk.index)
		logging.Infof(ctx, "Chunk flush starting")
		sent, err := f.upload(ctx, chunk)
		f.mu.Lock()
		if err == nil {
			f.rowsSent += sent
		} else {
			f.errs = append(f.errs, err)
		}
		f.mu.Unlock()
		logging.Infof(ctx, "Chunk flush finished")
	}
}

// upload sends the rows to BigQuery.
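//
// It gob-decodes each leased task into rows, sends them in a single
// 'insertAll' call with skipInvalidRows=true (retrying transient errors),
// reports per-reason counts of rejected rows to monitoring, and finally
// calls chunk.Done to delete the tasks, unless the upload failed with a
// transient error or a deadline.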
func (f *asyncFlusher) upload(ctx context.Context, chunk chunk) (int, error) {
	// Give up right away if the context is already dead.
	if err := ctx.Err(); err != nil {
		logging.WithError(err).Errorf(ctx, "Skipping upload")
		return 0, err
	}

	// Collect all pending data into an array of rows. We cheat here when
	// unpacking gob-serialized entries and use 'bqapi.JsonValue' instead of
	// 'bigquery.Value' in Data. They are compatible. This cheat avoids doing
	// a lot of allocations just to appease the type checker.
	var rows []*bqapi.TableDataInsertAllRequestRows
	var entries []rawEntry

	for _, task := range chunk.Tasks {
		ctx := logging.SetField(ctx, "name", task.Name)

		if err := gob.NewDecoder(bytes.NewReader(task.Payload)).Decode(&entries); err != nil {
			logging.WithError(err).Errorf(ctx, "Failed to gob-decode pending batch, it will be skipped")
			continue
		}

		for i, entry := range entries {
			insertID := entry.InsertID
			if insertID == "" {
				// The task names are autogenerated and guaranteed to be unique.
				// Use them as a base for the autogenerated insertID.
				insertID = fmt.Sprintf("bqlog:%s:%d", task.Name, i)
			}
			rows = append(rows, &bqapi.TableDataInsertAllRequestRows{
				InsertId: insertID,
				Json:     entry.Data,
			})
			// We need to nil the Data maps here to be able to reuse the
			// 'entries' array capacity later. Otherwise the gob decode
			// "discovers" the maps and overwrites their data in-place,
			// spoiling 'rows'.
			entries[i].Data = nil
		}

		entries = entries[:0]
	}

	if len(rows) == 0 {
		chunk.Done(ctx)
		return 0, nil
	}

	// Now actually send all the entries with retries.
	var lastResp *bqapi.TableDataInsertAllResponse
	taggedCtx := clock.Tag(ctx, "insert-retry") // used by tests
	err := retry.Retry(taggedCtx, transient.Only(f.retryParams), func() error {
		startTime := clock.Now(ctx)
		var err error
		lastResp, err = f.Insert(ctx, &bqapi.TableDataInsertAllRequest{
			SkipInvalidRows: true, // they will be reported in lastResp.InsertErrors
			Rows:            rows,
		})
		code := 0
		status := "ok"
		if gerr, _ := err.(*googleapi.Error); gerr != nil {
			code = gerr.Code
			status = fmt.Sprintf("http_%d", code)
		} else if ctx.Err() != nil {
			status = "timeout"
		} else if err != nil {
			status = "unknown"
		}
		dt := clock.Since(ctx, startTime)
		bigQueryLatency.Add(ctx, float64(dt.Nanoseconds()/1e6), f.TableRef, "insertAll", status)
		if code >= 500 {
			return transient.Tag.Apply(err)
		}
		return err
	}, func(err error, wait time.Duration) {
		logging.Fields{
			logging.ErrorKey: err,
			"wait":           wait,
		}.Warningf(ctx, "Failed to send data to BigQuery")
	})
	if err != nil {
		logging.WithError(err).Errorf(ctx, "Failed to send data to BigQuery")
		if !transient.Tag.In(err) && err != context.DeadlineExceeded {
			chunk.Done(ctx)
		}
		return 0, err
	}

	if success := len(rows) - len(lastResp.InsertErrors); success > 0 {
		flushedEntryCount.Add(ctx, int64(success), f.TableRef, "ok")
	}

	if len(lastResp.InsertErrors) != 0 {
		// Use only the first error as a sample. Dumping them all is impractical.
		blob, _ := json.MarshalIndent(lastResp.InsertErrors[0].Errors, "", " ")
		logging.Errorf(ctx, "%d rows weren't accepted, sample error:\n%s", len(lastResp.InsertErrors), blob)

		// Categorize errors by reason to dump them to monitoring.
		// We look only at the first suberror.
		perReason := make(map[string]int64, 5)
		for _, err := range lastResp.InsertErrors {
			reason := "unknown"
			if len(err.Errors) > 0 {
				reason = err.Errors[0].Reason // usually just "invalid"
			}
			perReason[reason]++
		}
		for reason, count := range perReason {
			flushedEntryCount.Add(ctx, count, f.TableRef, reason)
		}
	}

	chunk.Done(ctx)
	return len(rows), nil
}

// retryParams defines the retry strategy for handling BigQuery transient
// errors.
func (f *asyncFlusher) retryParams() retry.Iterator {
	return &retry.ExponentialBackoff{
		Limited: retry.Limited{
			Delay:    50 * time.Millisecond,
			Retries:  50,
			MaxTotal: 45 * time.Second,
		},
		Multiplier: 2,
	}
}
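// The declarations below are an illustrative usage sketch, not part of the
// package API: they show how a GAE app might wire up a Log, assuming a
// "bqlog-events" pull queue defined in queue.yaml and an existing
// "events.requests" table. All names and values here are hypothetical.
var exampleLog = Log{
	QueueName: "bqlog-events",
	DatasetID: "events",
	TableID:   "requests",
}

// exampleInsert would be called from a request handler. Rows buffer in the
// pull queue until a cron job flushes them.
func exampleInsert(ctx context.Context, method string) error {
	return exampleLog.Insert(ctx, &bigquery.ValuesSaver{
		Schema: bigquery.Schema{
			{Name: "method", Type: bigquery.StringFieldType},
		},
		Row: []bigquery.Value{method},
	})
}

// exampleFlushCron would be called periodically from a cron handler; per the
// Flush documentation it is fine to run it concurrently from several
// instances.
func exampleFlushCron(ctx context.Context) error {
	_, err := exampleLog.Flush(ctx)
	return err
}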