go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/bq/export.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bq

import (
	"context"
	"errors"
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/durationpb"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/swarming/server/bq/taskspb"
)

// exportDuration is the duration of the time interval each task exports to
// BigQuery.
const exportDuration = 15 * time.Second

// maxTasksToSchedule is the maximum number of export tasks which may be
// scheduled per cron job run.
const maxTasksToSchedule = 20

// latestAge is how far in the past an interval must end before it becomes
// eligible for scheduling by ScheduleExportTasks.
const latestAge = 2 * time.Minute

// maxExportStateAge is the amount of time before an ExportState is garbage
// collected.
const maxExportStateAge = 24 * time.Hour
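
// Taken together (assuming the 1m cron cadence noted in CleanupExportState
// below): an exportDuration of 15s means a healthy run schedules 4 new
// intervals per table, while maxTasksToSchedule caps a single run at
// 20*15s = 5 minutes of coverage. A run that has fallen behind therefore
// catches up on at most 4 minutes of backlog (5 minutes scheduled minus the
// 1 minute that accrued since the previous run).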

// RegisterTQTasks registers the TQ task class that handles the
// CreateExportTask payloads produced by ScheduleExportTasks.
func RegisterTQTasks() {
	tq.RegisterTaskClass(tq.TaskClass{
		ID:        "bq-export-interval",
		Kind:      tq.NonTransactional,
		Prototype: &taskspb.CreateExportTask{},
		Queue:     "bq-export-interval",
		Handler: func(ctx context.Context, payload proto.Message) error {
			return exportTask(ctx, payload.(*taskspb.CreateExportTask))
		},
	})
}
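
// A minimal wiring sketch for context: RegisterTQTasks is expected to be
// called from the server entry point, alongside cron handlers that drive
// ScheduleExportTasks and CleanupExportState. The module list, cron handler
// IDs, and table coordinates below are illustrative assumptions, not the
// actual Swarming server setup:
//
//	func main() {
//		modules := []module.Module{
//			cron.NewModuleFromFlags(),
//			tq.NewModuleFromFlags(),
//		}
//		server.Main(nil, modules, func(srv *server.Server) error {
//			bq.RegisterTQTasks()
//			cron.RegisterHandler("bq-export-schedule", func(ctx context.Context) error {
//				return bq.ScheduleExportTasks(ctx, "my-project", "swarming", "task_results")
//			})
//			cron.RegisterHandler("bq-export-cleanup", bq.CleanupExportState)
//			return nil
//		})
//	}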

// tableID returns the fully qualified "project.dataset.table" name of a
// BigQuery table.
func tableID(cloudProject, dataset, tableName string) string {
	return fmt.Sprintf("%s.%s.%s", cloudProject, dataset, tableName)
}

// CleanupExportState deletes export states which are older than
// maxExportStateAge.
func CleanupExportState(ctx context.Context) error {
	// ScheduleExportTasks runs every 1m, scheduling 4 exports per minute for
	// each of 4 tables, so a batch of 4 * 4 * 10 covers roughly 10 minutes'
	// worth of ExportState entities.
	const batchSize = 4 * 4 * 10
	// This value may need tuning.
	const nWorkers = 64
	g := new(errgroup.Group)
	g.SetLimit(nWorkers)

	now := clock.Now(ctx).UTC()
	cutoff := now.Add(-maxExportStateAge)
	logging.Infof(ctx, "Deleting ExportState created earlier than %s", cutoff)
	q := datastore.NewQuery(exportStateKind).Lte("CreatedAt", cutoff)

	deleteBatch := func(batch []*datastore.Key) {
		if len(batch) == 0 {
			return
		}
		g.Go(func() error {
			logging.Debugf(ctx, "Attempting delete of %d ExportStates", len(batch))
			return datastore.Delete(ctx, batch)
		})
	}

	// RunBatch invokes the callback sequentially, so a closure can safely
	// accumulate the current batch.
	batch := make([]*datastore.Key, 0, batchSize)
	err := datastore.RunBatch(ctx, batchSize, q, func(key *datastore.Key) {
		batch = append(batch, key)
		if len(batch) == batchSize {
			deleteBatch(batch)
			batch = make([]*datastore.Key, 0, batchSize)
		}
	})
	// Whatever is left in the final partial batch gets deleted here.
	deleteBatch(batch)

	if err != nil {
		logging.Errorf(ctx, "ExportState cleanup query failed: %s", err)
		// Useful work may still be happening in g; in that case wait until
		// it's done.
		return errors.Join(err, g.Wait())
	}
	return g.Wait()
}

// ScheduleExportTasks creates a series of tasks responsible for
// exporting a specific time interval to BigQuery. All of the TQ tasks
// scheduled will cover the range [NextExport, cutoff). If exports fall behind
// schedule, the scheduler will try to catch up by scheduling as many tasks as
// it can, up to a maximum of maxTasksToSchedule per run. A `DeduplicationKey`
// is used to ensure that no duplicate tasks are created if there are
// temporary failures to write to datastore.
func ScheduleExportTasks(ctx context.Context, cloudProject, dataset, tableName string) error {
	now := clock.Now(ctx).UTC()
	cutoff := now.Add(-latestAge)
	tableID := tableID(cloudProject, dataset, tableName)
	logging.Infof(ctx, "Scheduling export tasks: %s - %s", tableID, cutoff)
	sch := ExportSchedule{Key: exportScheduleKey(ctx, tableName)}
	err := datastore.Get(ctx, &sch)
	if err != nil {
		if errors.Is(err, datastore.ErrNoSuchEntity) {
			sch.NextExport = now.Truncate(time.Minute)
			logging.Infof(ctx, "Creating initial ExportSchedule - %+v", &sch)
			return datastore.Put(ctx, &sch)
		}
		return err
	}
	i := 0
	for {
		// Stop once exports have been scheduled up to the cutoff point, or
		// once the maximum number of export tasks per run has been reached.
		if sch.NextExport.Add(exportDuration).After(cutoff) || i >= maxTasksToSchedule {
			logging.Infof(ctx, "Scheduling export tasks done: %s", sch.NextExport)
			break
		}
		payload := taskspb.CreateExportTask{
			Start:        timestamppb.New(sch.NextExport),
			Duration:     durationpb.New(exportDuration),
			CloudProject: cloudProject,
			Dataset:      dataset,
			TableName:    tableName,
		}
		ts := sch.NextExport.Unix()
		dedupKey := fmt.Sprintf("%s:%d:%d", tableID, ts, exportDuration/time.Second)
		task := tq.Task{
			Title:            dedupKey,
			DeduplicationKey: dedupKey,
			Payload:          &payload,
		}
		logging.Debugf(ctx, "Triggering %s: %+v", dedupKey, &payload)
		err = tq.AddTask(ctx, &task)
		if err != nil {
			logging.Warningf(ctx, "Failed to trigger export task: %+v", &payload)
			break
		}
		sch.NextExport = sch.NextExport.Add(exportDuration)
		i++
	}
	logging.Infof(ctx, "Updating export schedule after %d iterations: %+v", i, &sch)
	return errors.Join(err, datastore.Put(ctx, &sch))
}
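
// As a concrete illustration of the deduplication key format above (the
// project and table names are made up): for tableID
// "my-project.swarming.task_results" and a NextExport of 2024-01-01T00:00:00Z
// (unix 1704067200), the key is
// "my-project.swarming.task_results:1704067200:15", so a cron run retried
// after a transient failure cannot enqueue a second task for the same
// interval.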

// exportTask handles a single CreateExportTask payload. At this revision it
// only logs the received task; the actual export is not performed here.
func exportTask(ctx context.Context, t *taskspb.CreateExportTask) error {
	logging.Infof(ctx, "ExportTask started for %s:%s:%s",
		tableID(t.CloudProject, t.Dataset, t.TableName),
		t.Start.AsTime(),
		t.Duration)
	return nil
}