go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/gerrit/poller/poller.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package poller implements stateful Gerrit polling.
package poller

import (
	"context"
	"fmt"
	"time"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/cv/internal/changelist"
	"go.chromium.org/luci/cv/internal/common"
	"go.chromium.org/luci/cv/internal/configs/prjcfg"
	"go.chromium.org/luci/cv/internal/gerrit"
	"go.chromium.org/luci/cv/internal/gerrit/gobmap"
)

const taskClassID = "poll-gerrit"

// pmNotifier encapsulates interaction with the Project Manager by the Poller.
//
// In production, implemented by prjmanager.Notifier.
type pmNotifier interface {
	NotifyCLsUpdated(ctx context.Context, luciProject string, cls *changelist.CLUpdatedEvents) error
}

// CLUpdater encapsulates interaction with the Gerrit CL Updater by the Poller.
type CLUpdater interface {
	Schedule(context.Context, *changelist.UpdateCLTask) error
	ScheduleDelayed(context.Context, *changelist.UpdateCLTask, time.Duration) error
}

// Poller polls Gerrit to discover new CLs and modifications of the existing
// ones.
type Poller struct {
	tqd       *tq.Dispatcher
	gFactory  gerrit.Factory
	clUpdater CLUpdater
	pm        pmNotifier
}

// New creates a new Poller, registering it in the given TQ dispatcher.
func New(tqd *tq.Dispatcher, g gerrit.Factory, clUpdater CLUpdater, pm pmNotifier) *Poller {
	p := &Poller{tqd, g, clUpdater, pm}
	tqd.RegisterTaskClass(tq.TaskClass{
		ID:           taskClassID,
		Prototype:    &PollGerritTask{},
		Queue:        "poll-gerrit",
		Quiet:        true,
		QuietOnError: true,
		Kind:         tq.NonTransactional,
		Handler: func(ctx context.Context, payload proto.Message) error {
			task := payload.(*PollGerritTask)
			ctx = logging.SetField(ctx, "project", task.GetLuciProject())
			err := p.poll(ctx, task.GetLuciProject(), task.GetEta().AsTime())
			return common.TQIfy{
				KnownRetry: []error{errConcurrentStateUpdate},
			}.Error(ctx, err)
		},
	})
	return p
}

// Poke schedules the next poll via task queue.
//
// Under perfect operation, this is redundant, but not harmful.
// Given bugs or imperfect operation, this ensures the poller continues
// operating.
//
// Must not be called inside a datastore transaction.
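//
// A minimal usage sketch (hypothetical wiring; the concrete dispatcher,
// factory, and notifier come from the server setup, not from this package):
//
//	p := poller.New(tqDispatcher, gerritFactory, clUpdater, pmNotifier)
//	if err := p.Poke(ctx, luciProject); err != nil {
//		// Poke fails only if the next poll task could not be enqueued.
//	}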
func (p *Poller) Poke(ctx context.Context, luciProject string) error {
	if datastore.CurrentTransaction(ctx) != nil {
		panic("must be called outside of transaction context")
	}
	return p.schedule(ctx, luciProject, time.Time{})
}

var errConcurrentStateUpdate = errors.New("concurrent change to poller state", transient.Tag)

// poll executes the next poll using the latest config known to the poller.
//
// For each discovered CL, enqueues a task for the CL updater to refresh the
// CL's state. Automatically enqueues a new task to perform the next poll.
func (p *Poller) poll(ctx context.Context, luciProject string, eta time.Time) error {
	if delay := clock.Now(ctx).Sub(eta); delay > maxAcceptableDelay {
		logging.Warningf(ctx, "poll %s arrived %s late; scheduling next poll instead", eta, delay)
		return p.schedule(ctx, luciProject, time.Time{})
	}
	// TODO(tandrii): avoid concurrent polling of the same project via cheap
	// best-effort locking in Redis.
	meta, err := prjcfg.GetLatestMeta(ctx, luciProject)
	switch {
	case err != nil:
	case meta.Status == prjcfg.StatusDisabled || meta.Status == prjcfg.StatusNotExists:
		if err := gobmap.Update(ctx, &meta, nil); err != nil {
			return err
		}
		if err = datastore.Delete(ctx, &State{LuciProject: luciProject}); err != nil {
			return errors.Annotate(err, "failed to disable poller for %q", luciProject).Err()
		}
		return nil
	case meta.Status == prjcfg.StatusEnabled:
		err = p.pollWithConfig(ctx, luciProject, meta)
	default:
		panic(fmt.Errorf("unknown project config status: %d", meta.Status))
	}

	switch {
	case err == nil:
		return p.schedule(ctx, luciProject, eta)
	case clock.Now(ctx).After(eta.Add(pollInterval - time.Second)):
		// Time to finish this task despite the error, and trigger a new one.
		err = errors.Annotate(err, "failed to do poll %s for %q", eta, luciProject).Err()
		common.LogError(ctx, err, errConcurrentStateUpdate)
		return p.schedule(ctx, luciProject, eta)
	default:
		return err
	}
}

// pollInterval is an approximate and merely best-effort average interval
// between polls of a single project.
//
// TODO(tandrii): revisit interval and error handling in pollWithConfig once CV
// subscribes to Gerrit PubSub.
const pollInterval = 10 * time.Second

// maxAcceptableDelay prevents polls that arrive too late from doing actual
// polling.
//
// maxAcceptableDelay / pollInterval effectively limits the number of
// concurrent polls of the same project that may happen due to task retries,
// delays, and queue throttling.
//
// Do not set this too low, as that may prevent actual polling from happening
// at all if the poll TQ is overloaded.
const maxAcceptableDelay = 6 * pollInterval

// schedule schedules the next poll.
//
// Optional `after` can be set to the current task's ETA to ensure that the
// next poll's task isn't de-duplicated with the current task.
func (p *Poller) schedule(ctx context.Context, luciProject string, after time.Time) error {
	// Desired properties:
	//  * for a single LUCI project, minimize p99 of actually observed poll
	//    intervals;
	//  * keep polling load on Gerrit at `1/pollInterval` per LUCI project;
	//  * avoid bursts of polls on Gerrit, i.e. distribute polls of different
	//    projects throughout `pollInterval`.
	//
	// So,
	//  * de-duplicate poll tasks to 1 task per LUCI project per pollInterval;
	//  * vary the epoch time, from which increments of pollInterval are done,
	//    by LUCI project. See projectOffset().
	if now := clock.Now(ctx); after.IsZero() || now.After(after) {
		after = now
	}
	offset := common.DistributeOffset(pollInterval, "gerrit-poller", luciProject)
	offset = offset.Truncate(time.Millisecond) // more readable logs
	eta := after.UTC().Truncate(pollInterval).Add(offset)
	for !eta.After(after) {
		eta = eta.Add(pollInterval)
	}
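	// For example (illustrative numbers): with pollInterval = 10s and a
	// per-project offset of 3.2s, a call with after = 12:00:07 UTC truncates
	// to 12:00:00, shifts to 12:00:03.2, and the loop bumps it to 12:00:13.2,
	// the first offset-shifted pollInterval boundary strictly after `after`.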
	task := &tq.Task{
		Title: luciProject,
		Payload: &PollGerritTask{
			LuciProject: luciProject,
			Eta:         timestamppb.New(eta),
		},
		ETA:              eta,
		DeduplicationKey: fmt.Sprintf("%s:%d", luciProject, eta.UnixNano()),
	}
	if err := p.tqd.AddTask(ctx, task); err != nil {
		return err
	}
	return nil
}

// State persists the poller's state in datastore.
//
// State is exported for exposure via the Admin API for debugging/observation.
// It must not be used elsewhere.
type State struct {
	_kind string `gae:"$kind,GerritPoller"`

	// LuciProject is the name of the LUCI project for which the poller is
	// working.
	LuciProject string `gae:"$id"`
	// UpdateTime is the timestamp when this state was last updated.
	UpdateTime time.Time `gae:",noindex"`
	// EVersion is the latest version number of the state.
	//
	// It increments by 1 every time the state is updated, either due to a new
	// project config OR after each successful poll.
	EVersion int64 `gae:",noindex"`
	// ConfigHash defines which Config version was last worked on.
	ConfigHash string `gae:",noindex"`
	// QueryStates tracks the states of individual queries.
	//
	// Most LUCI projects will run just 1 query per Gerrit host.
	// But, if a LUCI project is watching many Gerrit projects (a.k.a. Git
	// repos), then the Gerrit projects may be split between several queries.
	//
	// TODO(tandrii): rename the datastore property name.
	QueryStates *QueryStates `gae:"SubPollers"`
}

// pollWithConfig performs the poll and, if necessary, updates to the newest
// project config.
func (p *Poller) pollWithConfig(ctx context.Context, luciProject string, meta prjcfg.Meta) error {
	stateBefore := State{LuciProject: luciProject}
	switch err := datastore.Get(ctx, &stateBefore); {
	case err != nil && err != datastore.ErrNoSuchEntity:
		return errors.Annotate(err, "failed to get poller state for %q", luciProject).Tag(transient.Tag).Err()
	case err == datastore.ErrNoSuchEntity || stateBefore.ConfigHash != meta.Hash():
		if err = p.updateConfig(ctx, &stateBefore, meta); err != nil {
			return err
		}
	}

	// Use WorkPool to limit concurrency, but keep track of errors per query
	// ourselves because WorkPool doesn't guarantee a specific error order.
	errs := make(errors.MultiError, len(stateBefore.QueryStates.GetStates()))
	err := parallel.WorkPool(10, func(work chan<- func() error) {
		for i, qs := range stateBefore.QueryStates.GetStates() {
			i, qs := i, qs
			work <- func() error {
				ctx := logging.SetField(ctx, "gHost", qs.GetHost())
				err := p.doOneQuery(ctx, luciProject, qs)
				errs[i] = errors.Annotate(err, "query %s", qs).Err()
				return nil
			}
		}
	})
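	// WorkPool only surfaces errors returned by the work items; every item
	// above returns nil (per-query failures are recorded in errs instead), so
	// a non-nil err here would indicate a programming error, hence the panic.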
	if err != nil {
		panic(err)
	}
	// Save the state regardless of failures of individual queries.
	if saveErr := save(ctx, &stateBefore); saveErr != nil {
		// A saving error supersedes per-query errors.
		return saveErr
	}
	err = common.MostSevereError(errs)
	switch n, first := errs.Summary(); {
	case n == len(errs):
		return errors.Annotate(first, "no progress on any poller, first error").Err()
	case err != nil:
		// Some progress. We'll retry during the next poll.
		// TODO(tandrii): revisit this logic once CV subscribes to PubSub and
		// makes polling much less frequent.
		err = errors.Annotate(err, "failed %d/%d queries for %q. The most severe error:", n, len(errs), luciProject).Err()
		common.LogError(ctx, err)
	}
	return nil
}

// updateConfig fetches the latest config, and updates the gobmap and the
// poller's own state.
func (p *Poller) updateConfig(ctx context.Context, s *State, meta prjcfg.Meta) error {
	s.ConfigHash = meta.Hash()
	cgs, err := meta.GetConfigGroups(ctx)
	if err != nil {
		return err
	}
	if err := gobmap.Update(ctx, &meta, cgs); err != nil {
		return err
	}
	proposed := partitionConfig(cgs)
	toUse, discarded := reuseIfPossible(s.QueryStates.GetStates(), proposed)
	for _, d := range discarded {
		if err := p.notifyOnUnmatchedCLs(
			ctx, s.LuciProject, d.GetHost(), d.Changes,
			changelist.UpdateCLTask_UPDATE_CONFIG); err != nil {
			return err
		}
	}
	s.QueryStates = &QueryStates{States: toUse}
	return nil
}

// save saves the poller's state after the poll.
func save(ctx context.Context, s *State) error {
	var innerErr error
	var copied State
	err := datastore.RunInTransaction(ctx, func(ctx context.Context) (err error) {
		defer func() { innerErr = err }()
		latest := State{LuciProject: s.LuciProject}
		switch err = datastore.Get(ctx, &latest); {
		case err == datastore.ErrNoSuchEntity:
			if s.EVersion > 0 {
				// At the beginning of the poll, we read an existing state.
				// So, there was a concurrent deletion.
				return errors.Reason("poller state was unexpectedly missing").Err()
			}
			// Then, we'll create it.
		case err != nil:
			return errors.Annotate(err, "failed to get poller state").Tag(transient.Tag).Err()
		case latest.EVersion != s.EVersion:
			return errConcurrentStateUpdate
		}
		copied = *s
		copied.EVersion++
		copied.UpdateTime = clock.Now(ctx).UTC()
		if err = datastore.Put(ctx, &copied); err != nil {
			return errors.Annotate(err, "failed to save poller state").Tag(transient.Tag).Err()
		}
		return nil
	}, nil)

	switch {
	case innerErr != nil:
		return innerErr
	case err != nil:
		return errors.Annotate(err, "failed to save poller state").Tag(transient.Tag).Err()
	default:
		*s = copied
		return nil
	}
}
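// Note: save implements optimistic concurrency control via State.EVersion.
// If two polls race, both read, say, EVersion == 7; the first transaction to
// commit stores EVersion == 8, and the second re-reads EVersion == 8 != 7 and
// returns errConcurrentStateUpdate. The TQ handler registered in New() lists
// that error as a KnownRetry, so the losing poll task is simply retried
// rather than treated as a permanent failure.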