go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/appengine/coordinator/endpoints/services/taskQueue.go

// Copyright 2019 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package services

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/emptypb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	log "go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/gae/service/datastore"
	ds "go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/gae/service/info"
	"go.chromium.org/luci/gae/service/taskqueue"
	"go.chromium.org/luci/grpc/grpcutil"
	logdog "go.chromium.org/luci/logdog/api/endpoints/coordinator/services/v1"
	"go.chromium.org/luci/logdog/appengine/coordinator"
)

// baseArchiveQueueName is the taskqueue queue name which contains Archive tasks.
const baseArchiveQueueName = "archiveTasks"

// RawArchiveQueueName returns the raw name for the queueNumber'th queue used
// for ArchiveTasks, e.g. "archiveTasks" for queue 0 and "archiveTasks-3" for
// queue 3.
func RawArchiveQueueName(queueNumber int32) string {
	if queueNumber == 0 {
		// logdog queues didn't use to be sharded, so keep baseArchiveQueueName for
		// queue 0.
		return baseArchiveQueueName
	}
	return fmt.Sprintf("%s-%d", baseArchiveQueueName, queueNumber)
}

var (
	metricCreateTask = metric.NewCounter(
		"logdog/archival/create_task",
		"Number of created tasks from the archive queue, as seen by the coordinator",
		nil,
		field.String("project"),
		field.Int("queueNumber"))
	metricLeaseTask = metric.NewCounter(
		"logdog/archival/lease_task",
		"Number of leased tasks from the archive queue, as seen by the coordinator",
		nil,
		field.Int("numRetries"),
		field.String("project"),
		field.Int("queueNumber"))
	metricDeleteTask = metric.NewCounter(
		"logdog/archival/delete_task",
		"Number of delete task requests for the archive queue, as seen by the coordinator",
		nil,
		field.String("project"),
		field.Int("queueNumber"))
)

var (
	taskqueueLeaseRetry = func() retry.Iterator {
		// Very simple retry scheme; we only have 60s to service the entire
		// request, so we can't let this grow too large.
		return &retry.Limited{
			Delay:   time.Second,
			Retries: 3,
		}
	}
)

// taskArchival tasks an archival of a stream with the given delay.
func (s *server) taskArchival(c context.Context, state *coordinator.LogStreamState, realm string, delay time.Duration) error {
	// Now task the archival.
	state.Updated = clock.Now(c).UTC()
	state.ArchivalKey = []byte{'1'} // Use a fake key just to signal that we've tasked the archival.
	if err := ds.Put(c, state); err != nil {
		log.Fields{
			log.ErrorKey: err,
		}.Errorf(c, "Failed to Put() LogStream.")
		return status.Error(codes.Internal, "internal server error")
	}

	project := string(coordinator.ProjectFromNamespace(state.Parent.Namespace()))
	id := string(state.ID())
	t, err := tqTask(&logdog.ArchiveTask{Project: project, Id: id, Realm: realm})
	if err != nil {
		log.WithError(err).Errorf(c, "could not create archival task")
		return status.Error(codes.Internal, "internal server error")
	}
	t.Delay = delay
	queueName, queueNumber := s.getNextArchiveQueueName(c)

	if err := taskqueue.Add(c, queueName, t); err != nil {
		log.WithError(err).Errorf(c, "could not task archival")
		return status.Error(codes.Internal, "internal server error")
	}

	metricCreateTask.Add(c, 1, project, queueNumber)
	return nil
}

// tqTask returns a taskqueue task for an archive task.
func tqTask(task *logdog.ArchiveTask) (*taskqueue.Task, error) {
	if task.TaskName != "" {
		panic("task.TaskName is set while generating task")
	}

	payload, err := proto.Marshal(task)
	return &taskqueue.Task{
		Payload: payload,
		Method:  "PULL",
	}, err
}

// tqTaskLeased returns a taskqueue queue number and task from a
// previously-leased archive task.
func tqTaskLeased(task *logdog.ArchiveTask) (queueNumber int32, t *taskqueue.Task, err error) {
	if task.TaskName == "" {
		panic("task.TaskName is unset while deleting task")
	}

	t = &taskqueue.Task{
		Method: "PULL",
	}

	toks := strings.Split(task.TaskName, ":")
	switch len(toks) {
	case 1:
		// Legacy task: pre-sharding.
		queueNumber = 0
		t.Name = task.TaskName

	case 2:
		var queueNumberInt int

		if queueNumberInt, err = strconv.Atoi(toks[0]); err != nil {
			err = errors.Annotate(err, "parsing TaskName %q", task.TaskName).Err()
			return
		}

		queueNumber = int32(queueNumberInt)
		t.Name = toks[1]

	default:
		err = errors.Reason("unknown TaskName format %q", task.TaskName).Err()
	}

	return
}

// archiveTask creates an ArchiveTask proto from a taskqueue task.
func archiveTask(task *taskqueue.Task, queueNumber int32) (*logdog.ArchiveTask, error) {
	result := logdog.ArchiveTask{}
	err := proto.Unmarshal(task.Payload, &result)
	// NOTE: TaskName here is the LogDog "task name" and is the composite of the
	// queue number we pulled from plus `task.Name`, which is the auto-generated
	// task id assigned by the taskqueue service.
	result.TaskName = fmt.Sprintf("%d:%s", queueNumber, task.Name)
	return &result, err
}
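
// The sketch below illustrates the composite "<queueNumber>:<taskqueue name>"
// TaskName that archiveTask attaches to a leased task and that tqTaskLeased
// later decodes, which is what lets DeleteArchiveTasks route deletions back to
// the queue each task was leased from. It is illustrative only and is not
// called by the service; the function name and the literal values ("12345",
// queue 3) are assumptions made for the example.
func exampleTaskNameRoundTrip() error {
	// Serialize an ArchiveTask the same way taskArchival does.
	t, err := tqTask(&logdog.ArchiveTask{Project: "example-project", Id: "example-stream-id"})
	if err != nil {
		return err
	}

	// The taskqueue service normally assigns Task.Name when the task is added;
	// fake one here for the example.
	t.Name = "12345"

	// Pretend the task was leased from queue #3: the resulting composite
	// TaskName is "3:12345".
	at, err := archiveTask(t, 3)
	if err != nil {
		return err
	}

	// tqTaskLeased recovers both halves, so a deletion can target
	// RawArchiveQueueName(3) == "archiveTasks-3" with the raw task name "12345".
	queueNumber, leased, err := tqTaskLeased(at)
	if err != nil {
		return err
	}
	fmt.Printf("queue %q, task %q\n", RawArchiveQueueName(queueNumber), leased.Name)
	return nil
}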

// isArchived is a best-effort check to see if a task is archived.
// If the check fails, an error is logged and false is returned.
func isArchived(c context.Context, task *logdog.ArchiveTask) bool {
	var err error
	defer func() {
		if err != nil {
			logging.WithError(err).Errorf(c, "while checking if %s/%s is archived", task.Project, task.Id)
		}
	}()
	if c, err = info.Namespace(c, coordinator.ProjectNamespace(task.Project)); err != nil {
		return false
	}
	state := (&coordinator.LogStream{ID: coordinator.HashID(task.Id)}).State(c)
	if err = datastore.Get(c, state); err != nil {
		return false
	}
	return state.ArchivalState() == coordinator.ArchivedComplete
}

// LeaseArchiveTasks leases archive tasks to the requestor from a pull queue.
func (s *server) LeaseArchiveTasks(c context.Context, req *logdog.LeaseRequest) (*logdog.LeaseResponse, error) {
	if err := req.LeaseTime.CheckValid(); err != nil {
		return nil, err
	}
	duration := req.LeaseTime.AsDuration()

	var tasks []*taskqueue.Task
	logging.Infof(c, "got request to lease %d tasks for %s", req.MaxTasks, req.GetLeaseTime())

	queueName, queueNumber := s.getNextArchiveQueueName(c)
	logging.Infof(c, "picked queue %q", queueName)

	err := retry.Retry(c, taskqueueLeaseRetry,
		func() error {
			var err error

			tasks, err = taskqueue.Lease(c, int(req.MaxTasks), queueName, duration)
			// TODO(iannucci): There probably should be a better API for this error
			// detection stuff, but since taskqueue is deprecated, I don't think it's
			// worth the effort.
			ann := errors.Annotate(err, "leasing archive tasks")
			if err != nil && strings.Contains(err.Error(), "TRANSIENT_ERROR") {
				ann.Tag(
					grpcutil.ResourceExhaustedTag, // for HTTP response code 429.
					transient.Tag,                 // for retry.Retry.
				)
			}
			return ann.Err()
		},
		retry.LogCallback(c, "taskqueue.Lease"))
	if err != nil {
		return nil, err
	}

	logging.Infof(c, "got %d raw tasks from taskqueue", len(tasks))

	var archiveTasksL sync.Mutex
	archiveTasks := make([]*logdog.ArchiveTask, 0, len(tasks))

	parallel.WorkPool(8, func(ch chan<- func() error) {
		for _, task := range tasks {
			task := task
			at, err := archiveTask(task, queueNumber)
			if err != nil {
				// Ignore malformed name errors, just log them.
				logging.WithError(err).Errorf(c, "while leasing task")
				continue
			}

			ch <- func() error {
				// Optimization: Delete the task if it's already archived.
				if isArchived(c, at) {
					if err := taskqueue.Delete(c, queueName, task); err != nil {
						logging.WithError(err).Errorf(c, "failed to delete %s/%s (%s)", at.Project, at.Id, task.Name)
					}
					metricDeleteTask.Add(c, 1, at.Project, queueNumber)
				} else {
					// BUG(crbug.com/1323896): This log is here to aid diagnosis of some
					// weird metrics we were seeing; metrics reported retry counts of 70+,
					// but we weren't able to actually see which tasks those could be.
					if task.RetryCount > 50 {
						logging.Warningf(
							c, "crbug.com/1323896: High retry task encountered: project:%s, queue#:%d, retries:%d, taskName:%s",
							at.Project, queueNumber, task.RetryCount, task.Name)
					}

					archiveTasksL.Lock()
					defer archiveTasksL.Unlock()
					archiveTasks = append(archiveTasks, at)
					metricLeaseTask.Add(c, 1, task.RetryCount, at.Project, queueNumber)
				}
				return nil
			}
		}
	})

	logging.Infof(c, "Leasing %d tasks", len(archiveTasks))
	return &logdog.LeaseResponse{Tasks: archiveTasks}, nil
}

// DeleteArchiveTasks deletes archive tasks from the task queue.
// Errors from the underlying taskqueue deletions are logged but ignored.
func (s *server) DeleteArchiveTasks(c context.Context, req *logdog.DeleteRequest) (*emptypb.Empty, error) {
	// Although a single lease only ever hands out tasks from a single queue,
	// this RPC doesn't stipulate that all tasks must belong to the same queue,
	// so group them by queue before deleting.
	tasksPerQueue := map[int32][]*taskqueue.Task{}
	for _, at := range req.Tasks {
		queueNumber, tqTask, err := tqTaskLeased(at)
		if err != nil {
			return nil, err
		}

		metricDeleteTask.Add(c, 1, at.Project, queueNumber)
		tasksPerQueue[queueNumber] = append(tasksPerQueue[queueNumber], tqTask)
	}

	return &emptypb.Empty{}, parallel.WorkPool(8, func(ch chan<- func() error) {
		for queueNumber, tasks := range tasksPerQueue {
			queueNumber, tasks := queueNumber, tasks
			queueName := RawArchiveQueueName(queueNumber)

			ch <- func() error {
				logging.Infof(c, "Deleting %d tasks from %q", len(tasks), queueName)
				if err := taskqueue.Delete(c, queueName, tasks...); err != nil {
					logging.WithError(err).Errorf(c, "while deleting tasks\n%#v", tasks)
				}
				return nil
			}
		}
	})
}
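
// The sketch below shows the consumer side of these two RPCs: lease a batch of
// archive tasks, perform the archival work, then echo the successfully handled
// tasks back to DeleteArchiveTasks so they are removed from the queues they
// were leased from. It is illustrative only and not used by the service; the
// archiveLeaser interface (which simply mirrors the server methods above), the
// function name, and the archive callback are assumptions for the example. A
// real consumer would call the generated client for this service and would
// typically build req.LeaseTime with durationpb.New.
type archiveLeaser interface {
	LeaseArchiveTasks(context.Context, *logdog.LeaseRequest) (*logdog.LeaseResponse, error)
	DeleteArchiveTasks(context.Context, *logdog.DeleteRequest) (*emptypb.Empty, error)
}

func exampleLeaseAndAck(c context.Context, svc archiveLeaser, req *logdog.LeaseRequest, archive func(*logdog.ArchiveTask) error) error {
	resp, err := svc.LeaseArchiveTasks(c, req)
	if err != nil {
		return err
	}

	// Only delete tasks whose archival actually succeeded; the rest become
	// leasable again once the lease expires.
	done := make([]*logdog.ArchiveTask, 0, len(resp.Tasks))
	for _, at := range resp.Tasks {
		if err := archive(at); err != nil {
			logging.WithError(err).Errorf(c, "archival of %s/%s failed; leaving task for re-lease", at.Project, at.Id)
			continue
		}
		done = append(done, at)
	}
	if len(done) == 0 {
		return nil
	}

	_, err = svc.DeleteArchiveTasks(c, &logdog.DeleteRequest{Tasks: done})
	return err
}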