go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/appengine/coordinator/endpoints/services/taskQueue.go

// Copyright 2019 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package services

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/emptypb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	log "go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/gae/service/datastore"
	ds "go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/gae/service/info"
	"go.chromium.org/luci/gae/service/taskqueue"
	"go.chromium.org/luci/grpc/grpcutil"
	logdog "go.chromium.org/luci/logdog/api/endpoints/coordinator/services/v1"
	"go.chromium.org/luci/logdog/appengine/coordinator"
)

// baseArchiveQueueName is the taskqueue queue name which contains Archive tasks.
const baseArchiveQueueName = "archiveTasks"

// RawArchiveQueueName returns the raw name for the queueNumber'th queue used
// for ArchiveTasks.
func RawArchiveQueueName(queueNumber int32) string {
	if queueNumber == 0 {
		// logdog queues were not originally sharded, so keep
		// baseArchiveQueueName for queue 0.
		return baseArchiveQueueName
	}
	return fmt.Sprintf("%s-%d", baseArchiveQueueName, queueNumber)
}
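
// For example:
//
//	RawArchiveQueueName(0) // == "archiveTasks" (legacy, unsharded queue)
//	RawArchiveQueueName(3) // == "archiveTasks-3"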

var (
	metricCreateTask = metric.NewCounter(
		"logdog/archival/create_task",
		"Number of created tasks from the archive queue, as seen by the coordinator",
		nil,
		field.String("project"),
		field.Int("queueNumber"))
	metricLeaseTask = metric.NewCounter(
		"logdog/archival/lease_task",
		"Number of leased tasks from the archive queue, as seen by the coordinator",
		nil,
		field.Int("numRetries"),
		field.String("project"),
		field.Int("queueNumber"))
	metricDeleteTask = metric.NewCounter(
		"logdog/archival/delete_task",
		"Number of delete task requests for the archive queue, as seen by the coordinator",
		nil,
		field.String("project"),
		field.Int("queueNumber"))
)

var (
	taskqueueLeaseRetry = func() retry.Iterator {
		// Very simple retry scheme; we only have 60s to service the entire
		// request, so we can't let this grow too large.
		return &retry.Limited{
			Delay:   time.Second,
			Retries: 3,
		}
	}
)
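
// With the iterator above, retry.Retry makes at most four attempts total (the
// initial call plus 3 retries) spaced one second apart, adding roughly 3s of
// delay in the worst case and staying well within the 60s request budget.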

// taskArchival enqueues an archival task for a stream with the given delay.
func (s *server) taskArchival(c context.Context, state *coordinator.LogStreamState, realm string, delay time.Duration) error {
	// Now task the archival.
	state.Updated = clock.Now(c).UTC()
	state.ArchivalKey = []byte{'1'} // Use a fake key just to signal that we've tasked the archival.
	if err := ds.Put(c, state); err != nil {
		log.Fields{
			log.ErrorKey: err,
		}.Errorf(c, "Failed to Put() LogStream.")
		return status.Error(codes.Internal, "internal server error")
	}

	project := string(coordinator.ProjectFromNamespace(state.Parent.Namespace()))
	id := string(state.ID())
	t, err := tqTask(&logdog.ArchiveTask{Project: project, Id: id, Realm: realm})
	if err != nil {
		log.WithError(err).Errorf(c, "could not create archival task")
		return status.Error(codes.Internal, "internal server error")
	}
	t.Delay = delay
	queueName, queueNumber := s.getNextArchiveQueueName(c)

	if err := taskqueue.Add(c, queueName, t); err != nil {
		log.WithError(err).Errorf(c, "could not task archival")
		return status.Error(codes.Internal, "internal server error")
	}

	metricCreateTask.Add(c, 1, project, queueNumber)
	return nil
}

// tqTask returns a taskqueue task for an archive task.
func tqTask(task *logdog.ArchiveTask) (*taskqueue.Task, error) {
	if task.TaskName != "" {
		panic("task.TaskName is set while generating task")
	}

	payload, err := proto.Marshal(task)
	return &taskqueue.Task{
		Payload: payload,
		Method:  "PULL",
	}, err
}
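
// Note that tqTask leaves Task.Name empty: the taskqueue service assigns a
// name when the task is added, and archiveTask below folds that name together
// with the queue number into ArchiveTask.TaskName.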

// tqTaskLeased returns a taskqueue queue number and task from a
// previously-leased archive task.
func tqTaskLeased(task *logdog.ArchiveTask) (queueNumber int32, t *taskqueue.Task, err error) {
	if task.TaskName == "" {
		panic("task.TaskName is unset while deleting task")
	}

	t = &taskqueue.Task{
		Method: "PULL",
	}

	toks := strings.Split(task.TaskName, ":")
	switch len(toks) {
	case 1:
		// legacy task: pre-sharding
		queueNumber = 0
		t.Name = task.TaskName

	case 2:
		var queueNumberInt int

		if queueNumberInt, err = strconv.Atoi(toks[0]); err != nil {
			err = errors.Annotate(err, "parsing TaskName %q", task.TaskName).Err()
			return
		}

		queueNumber = int32(queueNumberInt)
		t.Name = toks[1]

	default:
		err = errors.Reason("unknown TaskName format %q", task.TaskName).Err()
	}

	return
}
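
// The two accepted TaskName formats, by way of example:
//
//	"task123"   -> queueNumber 0, t.Name = "task123" (legacy, pre-sharding)
//	"3:task123" -> queueNumber 3, t.Name = "task123"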

// archiveTask creates an ArchiveTask proto from a taskqueue task.
func archiveTask(task *taskqueue.Task, queueNumber int32) (*logdog.ArchiveTask, error) {
	result := logdog.ArchiveTask{}
	err := proto.Unmarshal(task.Payload, &result)
	// NOTE: TaskName here is the LogDog "task name" and is the composite of the
	// queue number we pulled from plus `task.Name`, which is the auto-generated
	// task id assigned by the taskqueue service.
	result.TaskName = fmt.Sprintf("%d:%s", queueNumber, task.Name)
	return &result, err
}
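
// archiveTask and tqTaskLeased are intended to be inverses with respect to the
// composite task name. A minimal sketch (payload is assumed to be a valid
// marshaled logdog.ArchiveTask):
//
//	at, _ := archiveTask(&taskqueue.Task{Name: "task123", Payload: payload}, 3)
//	// at.TaskName == "3:task123"
//	queueNumber, t, _ := tqTaskLeased(at)
//	// queueNumber == 3, t.Name == "task123"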

// isArchived is a best-effort check to see if a task's stream is archived.
// If the check fails, an error is logged and it returns false.
func isArchived(c context.Context, task *logdog.ArchiveTask) bool {
	var err error
	defer func() {
		if err != nil {
			logging.WithError(err).Errorf(c, "while checking if %s/%s is archived", task.Project, task.Id)
		}
	}()
	if c, err = info.Namespace(c, coordinator.ProjectNamespace(task.Project)); err != nil {
		return false
	}
	state := (&coordinator.LogStream{ID: coordinator.HashID(task.Id)}).State(c)
	if err = datastore.Get(c, state); err != nil {
		return false
	}
	return state.ArchivalState() == coordinator.ArchivedComplete
}
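
// isArchived is consulted by LeaseArchiveTasks below so that tasks whose
// streams have already reached ArchivedComplete are deleted rather than
// re-leased.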

// LeaseArchiveTasks leases archive tasks to the requestor from a pull queue.
func (s *server) LeaseArchiveTasks(c context.Context, req *logdog.LeaseRequest) (*logdog.LeaseResponse, error) {
	if err := req.LeaseTime.CheckValid(); err != nil {
		return nil, err
	}
	duration := req.LeaseTime.AsDuration()

	var tasks []*taskqueue.Task
	logging.Infof(c, "got request to lease %d tasks for %s", req.MaxTasks, req.GetLeaseTime())

	queueName, queueNumber := s.getNextArchiveQueueName(c)
	logging.Infof(c, "picked queue %q", queueName)

	err := retry.Retry(c, taskqueueLeaseRetry,
		func() error {
			var err error

			tasks, err = taskqueue.Lease(c, int(req.MaxTasks), queueName, duration)
			// TODO(iannucci): There probably should be a better API for this error
			// detection stuff, but since taskqueue is deprecated, I don't think it's
			// worth the effort.
			ann := errors.Annotate(err, "leasing archive tasks")
			if err != nil && strings.Contains(err.Error(), "TRANSIENT_ERROR") {
				ann.Tag(
					grpcutil.ResourceExhaustedTag, // for HTTP response code 429.
					transient.Tag,                 // for retry.Retry.
				)
			}
			return ann.Err()
		},
		retry.LogCallback(c, "taskqueue.Lease"))
	if err != nil {
		return nil, err
	}

	logging.Infof(c, "got %d raw tasks from taskqueue", len(tasks))

	var archiveTasksL sync.Mutex
	archiveTasks := make([]*logdog.ArchiveTask, 0, len(tasks))

	parallel.WorkPool(8, func(ch chan<- func() error) {
		for _, task := range tasks {
			task := task
			at, err := archiveTask(task, queueNumber)
			if err != nil {
				// Ignore malformed task payloads; just log them.
				logging.WithError(err).Errorf(c, "while leasing task")
				continue
			}

			ch <- func() error {
				// Optimization: Delete the task if it's already archived.
				if isArchived(c, at) {
					if err := taskqueue.Delete(c, queueName, task); err != nil {
						logging.WithError(err).Errorf(c, "failed to delete %s/%s (%s)", at.Project, at.Id, task.Name)
					}
					metricDeleteTask.Add(c, 1, at.Project, queueNumber)
				} else {
					// BUG(crbug.com/1323896): This log is here to aid diagnosis of some
					// weird metrics we were seeing; metrics reported retry counts of 70+,
					// but we weren't able to actually see which tasks those could be.
					if task.RetryCount > 50 {
						logging.Warningf(
							c, "crbug.com/1323896: High retry task encountered: project:%s, queue#:%d, retries:%d, taskName:%s",
							at.Project, queueNumber, task.RetryCount, task.Name)
					}

					archiveTasksL.Lock()
					defer archiveTasksL.Unlock()
					archiveTasks = append(archiveTasks, at)
					metricLeaseTask.Add(c, 1, task.RetryCount, at.Project, queueNumber)
				}
				return nil
			}
		}
	})

	logging.Infof(c, "Leasing %d tasks", len(archiveTasks))
	return &logdog.LeaseResponse{Tasks: archiveTasks}, nil
}
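
// A hypothetical client-side sketch (values and the durationpb import are
// assumptions, not part of this file): an archivist could lease a batch via
//
//	res, err := client.LeaseArchiveTasks(ctx, &logdog.LeaseRequest{
//		MaxTasks:  16,
//		LeaseTime: durationpb.New(10 * time.Minute),
//	})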

// DeleteArchiveTasks deletes archive tasks from the task queue.
// Errors are logged but ignored.
func (s *server) DeleteArchiveTasks(c context.Context, req *logdog.DeleteRequest) (*emptypb.Empty, error) {
	// Although we only ever issue archival tasks from a single queue, this RPC
	// doesn't stipulate that all tasks must belong to the same queue.
	tasksPerQueue := map[int32][]*taskqueue.Task{}
	for _, at := range req.Tasks {
		queueNumber, tqTask, err := tqTaskLeased(at)
		if err != nil {
			return nil, err
		}

		metricDeleteTask.Add(c, 1, at.Project, queueNumber)
		tasksPerQueue[queueNumber] = append(tasksPerQueue[queueNumber], tqTask)
	}

	return &emptypb.Empty{}, parallel.WorkPool(8, func(ch chan<- func() error) {
		for queueNumber, tasks := range tasksPerQueue {
			queueNumber, tasks := queueNumber, tasks
			queueName := RawArchiveQueueName(queueNumber)

			ch <- func() error {
				logging.Infof(c, "Deleting %d tasks from %q", len(tasks), queueName)
				if err := taskqueue.Delete(c, queueName, tasks...); err != nil {
					logging.WithError(err).Errorf(c, "while deleting tasks\n%#v", tasks)
				}
				return nil
			}
		}
	})
}
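
// A hypothetical follow-up to the lease sketch above: once archiving finishes,
// the archivist echoes the leased tasks back, and each task is routed to its
// original queue via the queue number encoded in its TaskName:
//
//	_, err = client.DeleteArchiveTasks(ctx, &logdog.DeleteRequest{
//		Tasks: res.Tasks,
//	})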