go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/gerrit/poller/poller.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package poller implements stateful Gerrit polling.
package poller

import (
	"context"
	"fmt"
	"time"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/cv/internal/changelist"
	"go.chromium.org/luci/cv/internal/common"
	"go.chromium.org/luci/cv/internal/configs/prjcfg"
	"go.chromium.org/luci/cv/internal/gerrit"
	"go.chromium.org/luci/cv/internal/gerrit/gobmap"
)

const taskClassID = "poll-gerrit"

// pmNotifier encapsulates interaction with Project Manager by the Poller.
//
// In production, implemented by prjmanager.Notifier.
type pmNotifier interface {
	NotifyCLsUpdated(ctx context.Context, luciProject string, cls *changelist.CLUpdatedEvents) error
}

// CLUpdater encapsulates interaction with Gerrit CL Updater by the Poller.
type CLUpdater interface {
	Schedule(context.Context, *changelist.UpdateCLTask) error
	ScheduleDelayed(context.Context, *changelist.UpdateCLTask, time.Duration) error
}

// Poller polls Gerrit to discover new CLs and modifications of the existing
// ones.
type Poller struct {
	tqd       *tq.Dispatcher
	gFactory  gerrit.Factory
	clUpdater CLUpdater
	pm        pmNotifier
}

// New creates a new Poller, registering it in the given TQ dispatcher.
func New(tqd *tq.Dispatcher, g gerrit.Factory, clUpdater CLUpdater, pm pmNotifier) *Poller {
	p := &Poller{tqd, g, clUpdater, pm}
	tqd.RegisterTaskClass(tq.TaskClass{
		ID:           taskClassID,
		Prototype:    &PollGerritTask{},
		Queue:        "poll-gerrit",
		Quiet:        true,
		QuietOnError: true,
		Kind:         tq.NonTransactional,
		Handler: func(ctx context.Context, payload proto.Message) error {
			task := payload.(*PollGerritTask)
			ctx = logging.SetField(ctx, "project", task.GetLuciProject())
			err := p.poll(ctx, task.GetLuciProject(), task.GetEta().AsTime())
			return common.TQIfy{
				KnownRetry: []error{errConcurrentStateUpdate},
			}.Error(ctx, err)
		},
	})
	return p
}

// Poke schedules the next poll via task queue.
//
// Under perfect operation, this is redundant, but not harmful.
// Given bugs or imperfect operation, this ensures the poller keeps operating.
//
// Must not be called inside a datastore transaction.
func (p *Poller) Poke(ctx context.Context, luciProject string) error {
	if datastore.CurrentTransaction(ctx) != nil {
		panic("must be called outside of transaction context")
	}
	return p.schedule(ctx, luciProject, time.Time{})
}

var errConcurrentStateUpdate = errors.New("concurrent change to poller state", transient.Tag)

// poll executes the next poll using the latest config known to the poller.
//
// For each discovered CL, it enqueues a task for the CL updater to refresh the
// CL's state. It also automatically enqueues a new task to perform the next
// poll.
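//
// For example (illustrative numbers only, derived from the constants below):
// with pollInterval = 10s and maxAcceptableDelay = 60s, a task whose ETA was
// 12:00:00 but which is only handled at 12:01:05 is 65s late, so the actual
// poll is skipped and only the next poll gets scheduled.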
func (p *Poller) poll(ctx context.Context, luciProject string, eta time.Time) error {
	if delay := clock.Now(ctx).Sub(eta); delay > maxAcceptableDelay {
		logging.Warningf(ctx, "poll %s arrived %s late; scheduling next poll instead", eta, delay)
		return p.schedule(ctx, luciProject, time.Time{})
	}
	// TODO(tandrii): avoid concurrent polling of the same project via cheap
	// best-effort locking in Redis.
	meta, err := prjcfg.GetLatestMeta(ctx, luciProject)
	switch {
	case err != nil:
	case (meta.Status == prjcfg.StatusDisabled || meta.Status == prjcfg.StatusNotExists):
		if err := gobmap.Update(ctx, &meta, nil); err != nil {
			return err
		}
		if err = datastore.Delete(ctx, &State{LuciProject: luciProject}); err != nil {
			return errors.Annotate(err, "failed to disable poller for %q", luciProject).Err()
		}
		return nil
	case meta.Status == prjcfg.StatusEnabled:
		err = p.pollWithConfig(ctx, luciProject, meta)
	default:
		panic(fmt.Errorf("unknown project config status: %d", meta.Status))
	}

	switch {
	case err == nil:
		return p.schedule(ctx, luciProject, eta)
	case clock.Now(ctx).After(eta.Add(pollInterval - time.Second)):
		// Time to finish this task despite error, and trigger a new one.
		err = errors.Annotate(err, "failed to do poll %s for %q", eta, luciProject).Err()
		common.LogError(ctx, err, errConcurrentStateUpdate)
		return p.schedule(ctx, luciProject, eta)
	default:
		return err
	}
}

// pollInterval is an approximate and merely best-effort average interval
// between polls of a single project.
//
// TODO(tandrii): revisit interval and error handling in pollWithConfig once CV
// subscribes to Gerrit PubSub.
const pollInterval = 10 * time.Second

// maxAcceptableDelay prevents polls which arrive too late from doing actual
// polling.
//
// maxAcceptableDelay / pollInterval effectively limits the number of
// concurrent polls of the same project that may happen due to task retries,
// delays, and queue throttling.
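//
// For example, with the current values (pollInterval = 10s, hence
// maxAcceptableDelay = 60s), roughly 6 delayed poll attempts for the same
// project can still fall within the acceptable window at any given time.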
//
// Do not set too low, as this may prevent actual polling from happening at all
// if the poll TQ is overloaded.
const maxAcceptableDelay = 6 * pollInterval

// schedule schedules the next poll.
//
// Optional `after` can be set to the current task's ETA to ensure that the
// next poll's task isn't de-duplicated with the current task.
func (p *Poller) schedule(ctx context.Context, luciProject string, after time.Time) error {
	// Desired properties:
	//   * for a single LUCI project, minimize p99 of actually observed poll
	//     intervals;
	//   * keep polling load on Gerrit at `1/pollInterval` per LUCI project;
	//   * avoid bursts of polls on Gerrit, i.e. distribute polls of different
	//     projects throughout `pollInterval`.
	//
	// So,
	//   * de-duplicate poll tasks to 1 task per LUCI project per pollInterval;
	//   * vary the epoch time, from which increments of pollInterval are made,
	//     by LUCI project. See projectOffset().
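	//
	// Worked example (illustrative values only): with pollInterval = 10s and a
	// project whose per-project offset comes out as 3.2s, a call at 12:00:07 UTC
	// truncates to 12:00:00, adds the offset to get 12:00:03.2, and since that
	// is not after 12:00:07 the loop below advances it to 12:00:13.2. Calls
	// landing in the same offset-shifted 10s window compute the same ETA and
	// therefore the same de-duplication key.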
	if now := clock.Now(ctx); after.IsZero() || now.After(after) {
		after = now
	}
	offset := common.DistributeOffset(pollInterval, "gerrit-poller", luciProject)
	offset = offset.Truncate(time.Millisecond) // more readable logs
	eta := after.UTC().Truncate(pollInterval).Add(offset)
	for !eta.After(after) {
		eta = eta.Add(pollInterval)
	}
	task := &tq.Task{
		Title: luciProject,
		Payload: &PollGerritTask{
			LuciProject: luciProject,
			Eta:         timestamppb.New(eta),
		},
		ETA:              eta,
		DeduplicationKey: fmt.Sprintf("%s:%d", luciProject, eta.UnixNano()),
	}
	if err := p.tqd.AddTask(ctx, task); err != nil {
		return err
	}
	return nil
}

// State persists the poller's state in datastore.
//
// State is exported for exposure via Admin API for debugging/observation.
// It must not be used elsewhere.
type State struct {
	_kind string `gae:"$kind,GerritPoller"`

	// LuciProject is the name of the LUCI project for which this poller works.
	LuciProject string `gae:"$id"`
	// UpdateTime is the timestamp when this state was last updated.
	UpdateTime time.Time `gae:",noindex"`
	// EVersion is the latest version number of the state.
	//
	// It is incremented by 1 every time the state is updated, either because a
	// new project config was ingested or after each successful poll.
	EVersion int64 `gae:",noindex"`
	// ConfigHash defines which Config version was last worked on.
	ConfigHash string `gae:",noindex"`
	// QueryStates tracks the states of individual queries.
	//
	// Most LUCI projects will run just 1 query per Gerrit host.
	// But, if a LUCI project is watching many Gerrit projects (a.k.a. Git repos),
	// then the Gerrit projects may be split between several queries.
	//
	// TODO(tandrii): rename the datastore property name.
	QueryStates *QueryStates `gae:"SubPollers"`
}

// pollWithConfig performs the poll and, if necessary, updates to the newest
// project config.
func (p *Poller) pollWithConfig(ctx context.Context, luciProject string, meta prjcfg.Meta) error {
	stateBefore := State{LuciProject: luciProject}
	switch err := datastore.Get(ctx, &stateBefore); {
	case err != nil && err != datastore.ErrNoSuchEntity:
		return errors.Annotate(err, "failed to get poller state for %q", luciProject).Tag(transient.Tag).Err()
	case err == datastore.ErrNoSuchEntity || stateBefore.ConfigHash != meta.Hash():
		if err = p.updateConfig(ctx, &stateBefore, meta); err != nil {
			return err
		}
	}

	// Use WorkPool to limit concurrency, but keep track of errors per query
	// ourselves because WorkPool doesn't guarantee a specific error order.
	errs := make(errors.MultiError, len(stateBefore.QueryStates.GetStates()))
	err := parallel.WorkPool(10, func(work chan<- func() error) {
		for i, qs := range stateBefore.QueryStates.GetStates() {
			i, qs := i, qs
			work <- func() error {
				ctx := logging.SetField(ctx, "gHost", qs.GetHost())
				err := p.doOneQuery(ctx, luciProject, qs)
				errs[i] = errors.Annotate(err, "query %s", qs).Err()
				return nil
			}
		}
	})
	if err != nil {
		panic(err)
	}
	// Save the state regardless of failures of individual queries.
	if saveErr := save(ctx, &stateBefore); saveErr != nil {
		// A saving error supersedes per-query errors.
		return saveErr
	}
	err = common.MostSevereError(errs)
	switch n, first := errs.Summary(); {
	case n == len(errs):
		return errors.Annotate(first, "no progress on any poller, first error").Err()
	case err != nil:
		// Some progress. We'll retry during the next poll.
		// TODO(tandrii): revisit this logic once CV subscribes to PubSub and makes
		// polling much less frequent.
		err = errors.Annotate(err, "failed %d/%d queries for %q. The most severe error:", n, len(errs), luciProject).Err()
		common.LogError(ctx, err)
	}
	return nil
}

// updateConfig fetches the latest config, then updates the gobmap and the
// poller's own state.
func (p *Poller) updateConfig(ctx context.Context, s *State, meta prjcfg.Meta) error {
	s.ConfigHash = meta.Hash()
	cgs, err := meta.GetConfigGroups(ctx)
	if err != nil {
		return err
	}
	if err := gobmap.Update(ctx, &meta, cgs); err != nil {
		return err
	}
	proposed := partitionConfig(cgs)
	toUse, discarded := reuseIfPossible(s.QueryStates.GetStates(), proposed)
	for _, d := range discarded {
		if err := p.notifyOnUnmatchedCLs(
			ctx, s.LuciProject, d.GetHost(), d.Changes,
			changelist.UpdateCLTask_UPDATE_CONFIG); err != nil {
			return err
		}
	}
	s.QueryStates = &QueryStates{States: toUse}
	return nil
}

// save saves the poller's state after the poll.
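//
// save uses EVersion for optimistic concurrency control: inside the
// transaction it re-reads the stored state, and if another task has bumped
// EVersion in the meantime it returns errConcurrentStateUpdate, which the TQ
// handler registered in New() treats as a known, retriable error.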
func save(ctx context.Context, s *State) error {
	var innerErr error
	var copied State
	err := datastore.RunInTransaction(ctx, func(ctx context.Context) (err error) {
		defer func() { innerErr = err }()
		latest := State{LuciProject: s.LuciProject}
		switch err = datastore.Get(ctx, &latest); {
		case err == datastore.ErrNoSuchEntity:
			if s.EVersion > 0 {
				// At the beginning of the poll, we read an existing state.
				// So, there was a concurrent deletion.
				return errors.Reason("poller state was unexpectedly missing").Err()
			}
			// Then, we'll create it.
		case err != nil:
			return errors.Annotate(err, "failed to get poller state").Tag(transient.Tag).Err()
		case latest.EVersion != s.EVersion:
			return errConcurrentStateUpdate
		}
		copied = *s
		copied.EVersion++
		copied.UpdateTime = clock.Now(ctx).UTC()
		if err = datastore.Put(ctx, &copied); err != nil {
			return errors.Annotate(err, "failed to save poller state").Tag(transient.Tag).Err()
		}
		return nil
	}, nil)

	switch {
	case innerErr != nil:
		return innerErr
	case err != nil:
		return errors.Annotate(err, "failed to save poller state").Tag(transient.Tag).Err()
	default:
		*s = copied
		return nil
	}
}