go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/appengine/bqlog/bqlog.go

// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package bqlog provides a mechanism to asynchronously log rows to BigQuery.
//
// Deprecated: this package depends on Pull Task Queues which are deprecated
// and not available from the GAE second-gen runtime or from Kubernetes. The
// replacement is go.chromium.org/luci/server/tq PubSub tasks, plus a PubSub
// push subscription with a handler that simply inserts rows into BigQuery.
package bqlog

import (
	"bytes"
	"context"
	"encoding/gob"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"cloud.google.com/go/bigquery"
	bqapi "google.golang.org/api/bigquery/v2"
	"google.golang.org/api/googleapi"

	"go.chromium.org/luci/gae/service/info"
	"go.chromium.org/luci/gae/service/taskqueue"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"

	"go.chromium.org/luci/common/tsmon/distribution"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/types"

	"go.chromium.org/luci/server/auth"
)

const (
	defaultBatchesPerRequest  = 250
	defaultMaxParallelUploads = 64
	defaultFlushTimeout       = time.Minute
)

var (
	// This can be used to estimate how many events are produced.
	insertedEntryCount = metric.NewCounter(
		"luci/bqlog/inserted_entry_count",
		"Total number of log entries successfully added in Insert(...).",
		nil,
		field.String("table")) // "<projID>/<datasetID>/<tableID>"

	// To track the performance of Insert(...).
	insertLatency = metric.NewCumulativeDistribution(
		"luci/bqlog/insert_latency",
		"Distribution of Insert(...) call latencies.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("status")) // "ok" or "fail"

	// To track the performance of 'Flush'.
	flushLatency = metric.NewCumulativeDistribution(
		"luci/bqlog/flush_latency",
		"Distribution of Flush(...) call latencies.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("status")) // "ok", "fail" or "warning"

	// This is perhaps the most important metric, since it shows the number of
	// rows skipped during the flush due to schema mismatch or other BigQuery
	// errors.
	flushedEntryCount = metric.NewCounter(
		"luci/bqlog/flushed_entry_count",
		"Total number of rows sent to BigQuery (including rejected rows).",
		nil,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("status")) // "ok" or whatever error reason BigQuery returns

	// Stats of individual BigQuery API calls (including retries).
	bigQueryLatency = metric.NewCumulativeDistribution(
		"luci/bqlog/bigquery_latency",
		"Distribution of BigQuery API call latencies.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("table"),  // "<projID>/<datasetID>/<tableID>"
		field.String("method"), // name of the API method, e.g. "insertAll"
		field.String("status")) // "ok", "http_400", ..., "timeout" or "unknown"

	// This can be used to estimate a queuing backlog.
	pullQueueLen = metric.NewInt(
		"luci/bqlog/pullqueue_len",
		"Number of tasks in the associated Pull Queue (prior to the Flush call).",
		nil,
		field.String("table")) // "<projID>/<datasetID>/<tableID>"

	// This estimates the queuing delay and any GAE scheduling hiccups.
	pullQueueLatency = metric.NewFloat(
		"luci/bqlog/pullqueue_latency",
		"Age of the oldest task in the queue or 0 if the queue is empty.",
		&types.MetricMetadata{Units: types.Milliseconds},
		field.String("table")) // "<projID>/<datasetID>/<tableID>"
)

// Log can be used to insert entries into a BigQuery table.
type Log struct {
	// QueueName is the name of a pull queue to use as a buffer for inserts.
	//
	// Required. It must be defined in the queue.yaml file and it must not be
	// used by any other Log object.
	QueueName string

	// ProjectID is the Cloud Project that owns the dataset.
	//
	// If empty, it will be derived from the current app ID.
	ProjectID string

	// DatasetID identifies the already existing dataset that contains the table.
	//
	// Required.
	DatasetID string

	// TableID identifies the name of the table in the dataset.
	//
	// Required. The table must exist already.
	TableID string

	// BatchesPerRequest is how many batches of entries to send in one BQ insert.
	//
	// A call to 'Insert' generates one batch of entries, thus BatchesPerRequest
	// essentially specifies how many 'Insert's to clump together when sending
	// data to BigQuery. If your Inserts are known to be huge, lowering this
	// value may help to avoid hitting memory limits.
	//
	// Default is 250. It assumes your batches are very small (1-3 rows), which
	// is usually the case if events are generated by online RPC handlers.
	BatchesPerRequest int

	// MaxParallelUploads is how many parallel ops to do when flushing.
	//
	// We limit it to avoid hitting OOM errors on GAE.
	//
	// Default is 64.
	MaxParallelUploads int

	// FlushTimeout is the maximum duration to spend fetching from the Pull
	// Queue in 'Flush'.
	//
	// We limit it to make sure 'Flush' has a chance to finish running before
	// GAE kills it by deadline. Next time 'Flush' is called, it will resume
	// flushing from where it left off.
	//
	// Note that 'Flush' can run for slightly longer, since it waits for all
	// pulled data to be flushed before returning.
	//
	// Default is 1 min.
	FlushTimeout time.Duration

	// DumpEntriesToLogger makes 'Insert' log all entries (at debug level).
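	//
	// Each row is dumped as indented JSON, which can be handy (together with
	// DryRun) when debugging locally.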
	DumpEntriesToLogger bool

	// DryRun disables the actual uploads (keeps the local logging though).
	DryRun bool

	// insertMock is used to mock the BigQuery insertAll call in tests.
	insertMock func(context.Context, *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error)
	// beforeSendChunk is used in tests to signal that 'sendChunk' is called.
	beforeSendChunk func(context.Context, []*taskqueue.Task)
}

// rawEntry is a single structured entry in the log.
//
// It gets gob-serialized and put into the Task Queue.
type rawEntry struct {
	InsertID string
	Data     map[string]bqapi.JsonValue // more like map[string]json.RawMessage
}

func init() {
	gob.Register(json.RawMessage{})
}

// valuesToJSON converts a bigquery.Value map to the simplest
// JSON-marshallable bqapi.JsonValue map.
//
// bigquery.Value can actually be pretty complex (e.g. time.Time, big.Rat),
// which makes it difficult to put into a gob-encodable rawEntry.
//
// So instead we convert it to JSON before putting it into gob (the bqapi
// library does it later anyway). But since the BigQuery API wants a
// map[string]bqapi.JsonValue, we return map[string]json.RawMessage instead of
// a complete raw []byte.
func valuesToJSON(in map[string]bigquery.Value) (map[string]bqapi.JsonValue, error) {
	if len(in) == 0 {
		return nil, nil
	}
	out := make(map[string]bqapi.JsonValue, len(in))
	for k, v := range in {
		blob, err := json.Marshal(v)
		if err != nil {
			return nil, errors.Annotate(err, "failed to JSON-serialize key %q", k).Err()
		}
		out[k] = json.RawMessage(blob)
	}
	return out, nil
}

// Insert adds a bunch of entries to the buffer of pending entries.
//
// It reuses the existing datastore transaction (if any). This allows logging
// entries transactionally when changing something in the datastore.
//
// Empty insert IDs will be replaced with autogenerated ones (they start with
// 'bqlog:'). Entries not matching the schema are logged and skipped during
// the flush.
func (l *Log) Insert(ctx context.Context, rows ...bigquery.ValueSaver) (err error) {
	if len(rows) == 0 {
		return nil
	}

	entries := make([]rawEntry, len(rows))
	for i, r := range rows {
		values, iid, err := r.Save()
		if err != nil {
			return errors.Annotate(err, "failure when saving row #%d", i).Err()
		}
		if entries[i].Data, err = valuesToJSON(values); err != nil {
			return errors.Annotate(err, "failure when serializing row #%d", i).Err()
		}
		entries[i].InsertID = iid
	}

	if l.DumpEntriesToLogger && logging.IsLogging(ctx, logging.Debug) {
		for idx, entry := range entries {
			blob, err := json.MarshalIndent(entry.Data, "", " ")
			if err != nil {
				logging.WithError(err).Errorf(ctx, "Failed to serialize the row #%d", idx)
			} else {
				logging.Debugf(ctx, "BigQuery row #%d for %s/%s:\n%s", idx, l.DatasetID, l.TableID, blob)
			}
		}
	}

	if l.DryRun {
		return nil
	}

	// We need tableRef to report the metrics, thus an error getting tableRef
	// is NOT reported to tsmon. It happens only if TableID or DatasetID are
	// malformed.
	tableRef, err := l.tableRef(ctx)
	if err != nil {
		return err
	}

	startTime := clock.Now(ctx)
	defer func() {
		dt := clock.Since(ctx, startTime)
		status := "fail"
		if err == nil {
			status = "ok"
			insertedEntryCount.Add(ctx, int64(len(entries)), tableRef)
		}
		insertLatency.Add(ctx, float64(dt.Nanoseconds()/1e6), tableRef, status)
	}()

	buf := bytes.Buffer{}
	if err := gob.NewEncoder(&buf).Encode(entries); err != nil {
		return err
	}
	return taskqueue.Add(ctx, l.QueueName, &taskqueue.Task{
		Method:  "PULL",
		Payload: buf.Bytes(),
	})
}

// Flush pulls buffered rows from the Pull Queue and sends them to BigQuery.
//
// Must be called periodically from some cron job. It is okay to call 'Flush'
// concurrently from multiple processes to speed up the upload.
//
// It succeeds if all entries it attempted to send were successfully handled
// by BigQuery. If some entries are malformed, it logs the error and skips
// them, so they don't get stuck in the pending buffer forever. This
// corresponds to 'skipInvalidRows=true' in the 'insertAll' BigQuery call.
//
// Returns the number of rows sent to BigQuery. May return both a non-zero
// number of rows and an error if something bad happened midway.
func (l *Log) Flush(ctx context.Context) (int, error) {
	tableRef, err := l.tableRef(ctx)
	if err != nil {
		return 0, err
	}
	ctx = logging.SetFields(ctx, logging.Fields{"table": tableRef})
	logging.Infof(ctx, "Flush started")

	startTime := clock.Now(ctx)

	softDeadline := startTime.Add(l.flushTimeout()) // when to stop pulling tasks
	hardDeadline := softDeadline.Add(time.Minute)   // when to abort all calls

	softDeadlineCtx, cancel := clock.WithDeadline(ctx, softDeadline)
	defer cancel()
	hardDeadlineCtx, cancel := clock.WithDeadline(ctx, hardDeadline)
	defer cancel()

	stats, err := taskqueue.Stats(ctx, l.QueueName)
	if err != nil {
		logging.WithError(err).Warningf(ctx, "Failed to query stats of queue %q", l.QueueName)
	} else {
		var age time.Duration
		if eta := stats[0].OldestETA; !eta.IsZero() {
			age = clock.Now(ctx).Sub(eta)
		}
		pullQueueLatency.Set(ctx, float64(age.Nanoseconds()/1e6), tableRef)
		pullQueueLen.Set(ctx, int64(stats[0].Tasks), tableRef)
	}

	// Lease pending upload tasks, split them into 'BatchesPerRequest' chunks,
	// upload all chunks in parallel (limiting the number of concurrent
	// uploads).
	flusher := asyncFlusher{
		Context:  hardDeadlineCtx,
		TableRef: tableRef,
		Insert:   l.insert,
	}
	flusher.start(l.maxParallelUploads())

	// We lease batches until we run out of time or there's nothing more to
	// lease. On errors or RPC deadlines we slow down, but carry on. We lease
	// until the hard deadline. Note that losing a lease is not a catastrophic
	// event: BigQuery still should be able to remove duplicates based on
	// insertID.
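	//
	// For example, with the default settings each iteration leases up to 250
	// tasks with a lease that lasts until the hard deadline, and transient
	// lease errors back off at 1s, 2s, 4s, ... until the next sleep would
	// cross the soft deadline.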
	var lastLeaseErr error
	sleep := time.Second
	for clock.Now(ctx).Before(softDeadline) {
		rpcCtx, cancel := clock.WithTimeout(softDeadlineCtx, 15*time.Second) // RPC timeout
		tasks, err := taskqueue.Lease(rpcCtx, l.batchesPerRequest(), l.QueueName, hardDeadline.Sub(clock.Now(ctx)))
		cancel()
		if err != nil {
			lastLeaseErr = err
			if clock.Now(ctx).Add(sleep).After(softDeadline) {
				logging.Warningf(ctx, "Error while leasing, giving up: %s", err)
				break
			}
			logging.Warningf(ctx, "Error while leasing, sleeping %s: %s", sleep, err)
			clock.Sleep(clock.Tag(softDeadlineCtx, "lease-retry"), sleep)
			sleep *= 2
			continue
		}
		sleep = time.Second
		if len(tasks) == 0 {
			break
		}
		if l.beforeSendChunk != nil {
			l.beforeSendChunk(ctx, tasks)
		}
		flusher.sendChunk(chunk{
			Tasks: tasks,
			Done: func(ctx context.Context) {
				logging.Infof(ctx, "Deleting %d tasks from the task queue", len(tasks))
				ctx, cancel := clock.WithTimeout(ctx, 30*time.Second) // RPC timeout
				defer cancel()
				if err := taskqueue.Delete(ctx, l.QueueName, tasks...); err != nil {
					logging.WithError(err).Errorf(ctx, "Failed to delete some tasks")
				}
			},
		})
	}

	sent, err := flusher.waitAll()
	logging.Infof(ctx, "Flush finished, sent %d rows", sent)

	if err == nil {
		err = lastLeaseErr
	}

	dt := clock.Since(ctx, startTime)
	status := "ok"
	switch {
	case err != nil && sent == 0:
		status = "fail"
	case err != nil && sent != 0:
		status = "warning"
	}
	flushLatency.Add(ctx, float64(dt.Nanoseconds()/1e6), tableRef, status)

	return sent, err
}

func (l *Log) batchesPerRequest() int {
	if l.BatchesPerRequest > 0 {
		return l.BatchesPerRequest
	}
	return defaultBatchesPerRequest
}

func (l *Log) maxParallelUploads() int {
	if l.MaxParallelUploads > 0 {
		return l.MaxParallelUploads
	}
	return defaultMaxParallelUploads
}

func (l *Log) flushTimeout() time.Duration {
	if l.FlushTimeout > 0 {
		return l.FlushTimeout
	}
	return defaultFlushTimeout
}

func (l *Log) insert(ctx context.Context, r *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error) {
	if l.insertMock != nil {
		return l.insertMock(ctx, r)
	}
	return l.doInsert(ctx, r)
}

// projID returns ProjectID or the GAE app ID if ProjectID is "".
func (l *Log) projID(ctx context.Context) string {
	if l.ProjectID == "" {
		return info.TrimmedAppID(ctx)
	}
	return l.ProjectID
}

// tableRef returns an identifier of the table in BigQuery.
//
// Returns an error if the Log is misconfigured.
func (l *Log) tableRef(ctx context.Context) (string, error) {
	projID := l.projID(ctx)
	if projID == "" || strings.ContainsRune(projID, '/') {
		return "", fmt.Errorf("invalid project ID %q", projID)
	}
	if l.DatasetID == "" || strings.ContainsRune(l.DatasetID, '/') {
		return "", fmt.Errorf("invalid dataset ID %q", l.DatasetID)
	}
	if l.TableID == "" || strings.ContainsRune(l.TableID, '/') {
		return "", fmt.Errorf("invalid table ID %q", l.TableID)
	}
	return fmt.Sprintf("%s/%s/%s", projID, l.DatasetID, l.TableID), nil
}

// bigQuery constructs an instance of the BigQuery API client with proper auth.
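//
// It authenticates as the service itself (auth.AsSelf) with the BigQuery
// OAuth scope, so the app's own service account must be allowed to insert
// rows into the table.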
func (l *Log) bigQuery(ctx context.Context) (*bqapi.Service, error) {
	tr, err := auth.GetRPCTransport(ctx, auth.AsSelf, auth.WithScopes(bqapi.BigqueryScope))
	if err != nil {
		return nil, err
	}
	return bqapi.New(&http.Client{Transport: tr})
}

// doInsert does the actual BigQuery call.
//
// It is mocked in tests.
func (l *Log) doInsert(ctx context.Context, req *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error) {
	ctx, cancel := clock.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	logging.Infof(ctx, "Sending %d rows to BigQuery", len(req.Rows))
	bq, err := l.bigQuery(ctx)
	if err != nil {
		return nil, err
	}
	call := bq.Tabledata.InsertAll(l.projID(ctx), l.DatasetID, l.TableID, req)
	return call.Context(ctx).Do()
}

// asyncFlusher implements a parallel flush to BigQuery.
type asyncFlusher struct {
	Context  context.Context // the root context
	TableRef string          // for monitoring metrics
	Insert   func(context.Context, *bqapi.TableDataInsertAllRequest) (*bqapi.TableDataInsertAllResponse, error)

	index int32 // incremented in each 'sendChunk' call

	chunks chan chunk
	wg     sync.WaitGroup

	mu       sync.Mutex
	errs     errors.MultiError // collected errors from all 'sendChunk' ops
	rowsSent int               // total number of rows sent to BigQuery
}

// chunk is a bunch of pending batches flushed together.
type chunk struct {
	Tasks []*taskqueue.Task
	Done  func(context.Context) // called in a goroutine after successful upload

	index int32 // used only for logging, see sendChunk
}

// start launches the internal goroutines that upload data.
func (f *asyncFlusher) start(numParallel int) {
	f.chunks = make(chan chunk)
	for i := 0; i < numParallel; i++ {
		f.wg.Add(1)
		go func() {
			defer f.wg.Done()
			f.uploaderLoop()
		}()
	}
}

// waitAll waits for completion of all pending 'sendChunk' calls and stops all
// internal goroutines.
//
// Returns the total number of rows sent and all the errors.
func (f *asyncFlusher) waitAll() (int, error) {
	close(f.chunks)
	f.wg.Wait()
	if len(f.errs) == 0 {
		return f.rowsSent, nil
	}
	return f.rowsSent, f.errs
}

// ctx returns a context to use for logging operations happening to some chunk.
func (f *asyncFlusher) ctx(chunkIndex int32) context.Context {
	return logging.SetField(f.Context, "chunk", chunkIndex)
}

// sendChunk starts an asynchronous operation to upload data to BigQuery.
//
// Can block if too many parallel uploads are already underway. Panics if
// called before 'start' or after 'waitAll'.
//
// On successful upload it deletes the tasks from the Pull Queue.
func (f *asyncFlusher) sendChunk(c chunk) {
	c.index = atomic.AddInt32(&f.index, 1)
	logging.Infof(f.ctx(c.index), "Chunk with %d batches queued", len(c.Tasks))
	f.chunks <- c
}

// uploaderLoop runs in a separate goroutine.
func (f *asyncFlusher) uploaderLoop() {
	for chunk := range f.chunks {
		ctx := f.ctx(chunk.index)
		logging.Infof(ctx, "Chunk flush starting")
		sent, err := f.upload(ctx, chunk)
		f.mu.Lock()
		if err == nil {
			f.rowsSent += sent
		} else {
			f.errs = append(f.errs, err)
		}
		f.mu.Unlock()
		logging.Infof(ctx, "Chunk flush finished")
	}
}

// upload sends the rows to BigQuery.
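//
// It gob-decodes each leased task into rows, sends them in a single
// 'insertAll' call with skipInvalidRows=true (retrying transient errors),
// reports per-reason counts of rejected rows to monitoring, and finally
// calls chunk.Done to delete the tasks, unless the upload failed with a
// transient error or a deadline.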
func (f *asyncFlusher) upload(ctx context.Context, chunk chunk) (int, error) {
	// Give up right away if the context is already dead.
	if err := ctx.Err(); err != nil {
		logging.WithError(err).Errorf(ctx, "Skipping upload")
		return 0, err
	}

	// Collect all pending data into an array of rows. We cheat here when
	// unpacking gob-serialized entries and use 'bqapi.JsonValue' instead of
	// 'bigquery.Value' in Data. They are compatible. This cheat avoids doing
	// a lot of allocations just to appease the type checker.
	var rows []*bqapi.TableDataInsertAllRequestRows
	var entries []rawEntry

	for _, task := range chunk.Tasks {
		ctx := logging.SetField(ctx, "name", task.Name)

		if err := gob.NewDecoder(bytes.NewReader(task.Payload)).Decode(&entries); err != nil {
			logging.WithError(err).Errorf(ctx, "Failed to gob-decode pending batch, it will be skipped")
			continue
		}

		for i, entry := range entries {
			insertID := entry.InsertID
			if insertID == "" {
				// The task names are autogenerated and guaranteed to be unique.
				// Use them as a base for the autogenerated insertID.
				insertID = fmt.Sprintf("bqlog:%s:%d", task.Name, i)
			}
			rows = append(rows, &bqapi.TableDataInsertAllRequestRows{
				InsertId: insertID,
				Json:     entry.Data,
			})
			// We need to nil the Data maps here to be able to reuse the
			// 'entries' array capacity later. Otherwise the gob decode
			// "discovers" the maps and overwrites their data in-place,
			// spoiling 'rows'.
			entries[i].Data = nil
		}

		entries = entries[:0]
	}

	if len(rows) == 0 {
		chunk.Done(ctx)
		return 0, nil
	}

	// Now actually send all the entries with retries.
	var lastResp *bqapi.TableDataInsertAllResponse
	taggedCtx := clock.Tag(ctx, "insert-retry") // used by tests
	err := retry.Retry(taggedCtx, transient.Only(f.retryParams), func() error {
		startTime := clock.Now(ctx)
		var err error
		lastResp, err = f.Insert(ctx, &bqapi.TableDataInsertAllRequest{
			SkipInvalidRows: true, // they will be reported in lastResp.InsertErrors
			Rows:            rows,
		})
		code := 0
		status := "ok"
		if gerr, _ := err.(*googleapi.Error); gerr != nil {
			code = gerr.Code
			status = fmt.Sprintf("http_%d", code)
		} else if ctx.Err() != nil {
			status = "timeout"
		} else if err != nil {
			status = "unknown"
		}
		dt := clock.Since(ctx, startTime)
		bigQueryLatency.Add(ctx, float64(dt.Nanoseconds()/1e6), f.TableRef, "insertAll", status)
		if code >= 500 {
			return transient.Tag.Apply(err)
		}
		return err
	}, func(err error, wait time.Duration) {
		logging.Fields{
			logging.ErrorKey: err,
			"wait":           wait,
		}.Warningf(ctx, "Failed to send data to BigQuery")
	})
	if err != nil {
		logging.WithError(err).Errorf(ctx, "Failed to send data to BigQuery")
		if !transient.Tag.In(err) && err != context.DeadlineExceeded {
			chunk.Done(ctx)
		}
		return 0, err
	}

	if success := len(rows) - len(lastResp.InsertErrors); success > 0 {
		flushedEntryCount.Add(ctx, int64(success), f.TableRef, "ok")
	}

	if len(lastResp.InsertErrors) != 0 {
		// Use only the first error as a sample. Dumping them all is impractical.
		blob, _ := json.MarshalIndent(lastResp.InsertErrors[0].Errors, "", " ")
		logging.Errorf(ctx, "%d rows weren't accepted, sample error:\n%s", len(lastResp.InsertErrors), blob)

		// Categorize errors by reason to dump them to monitoring.
		// We look only at the first suberror.
		perReason := make(map[string]int64, 5)
		for _, err := range lastResp.InsertErrors {
			reason := "unknown"
			if len(err.Errors) > 0 {
				reason = err.Errors[0].Reason // usually just "invalid"
			}
			perReason[reason]++
		}
		for reason, count := range perReason {
			flushedEntryCount.Add(ctx, count, f.TableRef, reason)
		}
	}

	chunk.Done(ctx)
	return len(rows), nil
}

// retryParams defines the retry strategy for handling BigQuery transient
// errors.
func (f *asyncFlusher) retryParams() retry.Iterator {
	return &retry.ExponentialBackoff{
		Limited: retry.Limited{
			Delay:    50 * time.Millisecond,
			Retries:  50,
			MaxTotal: 45 * time.Second,
		},
		Multiplier: 2,
	}
}
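// The declarations below are an illustrative usage sketch, not part of the
// package API: they show how a GAE app might wire up a Log, assuming a
// "bqlog-events" pull queue defined in queue.yaml and an existing
// "events.requests" table. All names and values here are hypothetical.
var exampleLog = Log{
	QueueName: "bqlog-events",
	DatasetID: "events",
	TableID:   "requests",
}

// exampleInsert would be called from a request handler. Rows buffer in the
// pull queue until a cron job flushes them.
func exampleInsert(ctx context.Context, method string) error {
	return exampleLog.Insert(ctx, &bigquery.ValuesSaver{
		Schema: bigquery.Schema{
			{Name: "method", Type: bigquery.StringFieldType},
		},
		Row: []bigquery.Value{method},
	})
}

// exampleFlushCron would be called periodically from a cron handler; per the
// Flush documentation it is fine to run it concurrently from several
// instances.
func exampleFlushCron(ctx context.Context) error {
	_, err := exampleLog.Flush(ctx)
	return err
}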