go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/manager/manager.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package manager
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"google.golang.org/protobuf/proto"
    23  
    24  	"go.chromium.org/luci/common/clock"
    25  	"go.chromium.org/luci/common/data/stringset"
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/common/logging"
    28  	"go.chromium.org/luci/common/retry/transient"
    29  	"go.chromium.org/luci/gae/filter/txndefer"
    30  	"go.chromium.org/luci/gae/service/datastore"
    31  
    32  	"go.chromium.org/luci/cv/internal/changelist"
    33  	"go.chromium.org/luci/cv/internal/common"
    34  	"go.chromium.org/luci/cv/internal/common/eventbox"
    35  	"go.chromium.org/luci/cv/internal/gerrit"
    36  	"go.chromium.org/luci/cv/internal/gerrit/poller"
    37  	"go.chromium.org/luci/cv/internal/prjmanager"
    38  	"go.chromium.org/luci/cv/internal/prjmanager/clpurger"
    39  	"go.chromium.org/luci/cv/internal/prjmanager/cltriggerer"
    40  	"go.chromium.org/luci/cv/internal/prjmanager/prjpb"
    41  	"go.chromium.org/luci/cv/internal/prjmanager/state"
    42  	"go.chromium.org/luci/cv/internal/prjmanager/triager"
    43  	"go.chromium.org/luci/cv/internal/run"
    44  	"go.chromium.org/luci/cv/internal/run/runcreator"
    45  	"go.chromium.org/luci/cv/internal/tracing"
    46  )
    47  
    48  const (
    49  	// maxEventsPerBatch limits the number of incoming events the PM will process at
    50  	// once.
    51  	//
    52  	// This shouldn't be hit in practice under normal operation. This is chosen such
    53  	// that PM can read these events and make some progress in 1 minute.
    54  	maxEventsPerBatch = 10000
    55  
    56  	// logProjectStateFrequency forces saving ProjectLog entity iff
    57  	// Project.EVersion is divisible by logProjectStateFrequency.
    58  	//
    59  	// In practice, the busiest projects sustain at most ~1 QPS of updates.
    60  	// Thus, value of 60 limits ProjectLog to at most 1/minute or 1.5k/day.
    61  	logProjectStateFrequency = 60
    62  )
    63  
// errTaskArrivedTooLate is returned by manageProject when a ManageProject task
// starts executing more than prjpb.MaxAcceptableDelay after its ETA.
//
// The ManageProject task handler registered in New lists it under KnownIgnore.
var errTaskArrivedTooLate = errors.New("task arrived too late")
    65  
// ProjectManager implements managing projects.
//
// tasksBinding is used to dispatch follow-up ManageProject tasks, and handler
// holds the dependencies (notifiers, CL mutator, purger, triggerer, poller,
// triage function) that are invoked while processing a batch of events.
type ProjectManager struct {
	tasksBinding prjpb.TasksBinding
	handler      state.Handler
}
    71  
    72  // New creates a new ProjectManager and registers it for handling tasks created
    73  // by the given TQ Notifier.
    74  func New(n *prjmanager.Notifier, rn state.RunNotifier, c *changelist.Mutator, g gerrit.Factory, u *changelist.Updater) *ProjectManager {
    75  	pm := &ProjectManager{
    76  		tasksBinding: n.TasksBinding,
    77  		handler: state.Handler{
    78  			CLMutator:       c,
    79  			PMNotifier:      n,
    80  			RunNotifier:     rn,
    81  			CLPurger:        clpurger.New(n, g, u, c),
    82  			CLTriggerer:     cltriggerer.New(n, g, u, c),
    83  			CLPoller:        poller.New(n.TasksBinding.TQDispatcher, g, u, n),
    84  			ComponentTriage: triager.Triage,
    85  		},
    86  	}
    87  	n.TasksBinding.ManageProject.AttachHandler(
    88  		func(ctx context.Context, payload proto.Message) error {
    89  			task := payload.(*prjpb.ManageProjectTask)
    90  			ctx = logging.SetField(ctx, "project", task.GetLuciProject())
    91  			err := pm.manageProject(ctx, task.GetLuciProject(), task.GetEta().AsTime())
    92  			return common.TQIfy{
    93  				KnownIgnore:     []error{errTaskArrivedTooLate},
    94  				KnownIgnoreTags: []errors.BoolTag{common.DSContentionTag},
    95  				KnownRetryTags:  []errors.BoolTag{runcreator.StateChangedTag},
    96  			}.Error(ctx, err)
    97  		},
    98  	)
    99  
   100  	n.TasksBinding.KickManageProject.AttachHandler(
   101  		func(ctx context.Context, payload proto.Message) error {
   102  			task := payload.(*prjpb.KickManageProjectTask)
   103  			var eta time.Time
   104  			if t := task.GetEta(); t != nil {
   105  				eta = t.AsTime()
   106  			}
   107  			err := n.TasksBinding.Dispatch(ctx, task.GetLuciProject(), eta)
   108  			return common.TQifyError(ctx, err)
   109  		},
   110  	)
   111  	return pm
   112  }
   113  
   114  func (pm *ProjectManager) manageProject(ctx context.Context, luciProject string, taskETA time.Time) error {
   115  	retryViaNewTask := false
   116  	var processErr error
   117  	if delay := clock.Now(ctx).Sub(taskETA); delay > prjpb.MaxAcceptableDelay {
   118  		logging.Warningf(ctx, "task %s arrived %s late; scheduling next task instead", taskETA, delay)
   119  		retryViaNewTask = true
   120  		processErr = errTaskArrivedTooLate
   121  	} else {
   122  		processErr = pm.processBatch(ctx, luciProject)
   123  		if common.DSContentionTag.In(processErr) {
   124  			logging.Warningf(ctx, "Datastore contention; scheduling next task instead")
   125  			retryViaNewTask = true
   126  		}
   127  	}
   128  
   129  	if retryViaNewTask {
   130  		// Scheduling new task reduces probability of concurrent tasks in extreme
   131  		// events.
   132  		if err := pm.tasksBinding.Dispatch(ctx, luciProject, time.Time{}); err != nil {
   133  			// This should be rare and retry is the best we can do.
   134  			return err
   135  		}
   136  	}
   137  	return processErr
   138  }
   139  
   140  func (pm *ProjectManager) processBatch(ctx context.Context, luciProject string) error {
   141  	proc := &pmProcessor{
   142  		luciProject: luciProject,
   143  		handler:     &pm.handler,
   144  	}
   145  	recipient := prjmanager.EventboxRecipient(ctx, luciProject)
   146  	postProcessFns, err := eventbox.ProcessBatch(ctx, recipient, proc, maxEventsPerBatch)
   147  	if err != nil {
   148  		return err
   149  	}
   150  	if l := len(postProcessFns); l > 0 {
   151  		panic(fmt.Errorf("postProcessFns is not supported in PM; got %d", l))
   152  	}
   153  	return nil
   154  }
   155  
// pmProcessor implements eventbox.Processor.
//
// processBatch creates a new instance for every batch; it carries the name of
// the LUCI project being managed and a pointer to the ProjectManager's
// state.Handler.
type pmProcessor struct {
	luciProject string
	handler     *state.Handler
	// loadedPState is set by LoadState and read by SaveState.
	//
	// It remains nil when LoadState finds no stored Project; SaveState reads
	// it only through the Get* accessors.
	loadedPState *prjpb.PState
}
   163  
   164  // LoadState is called to load the state before a transaction.
   165  func (proc *pmProcessor) LoadState(ctx context.Context) (eventbox.State, eventbox.EVersion, error) {
   166  	s := &state.State{}
   167  	switch p, err := prjmanager.Load(ctx, proc.luciProject); {
   168  	case err != nil:
   169  		return nil, 0, err
   170  	case p == nil:
   171  		s.PB = &prjpb.PState{LuciProject: proc.luciProject}
   172  		return s, 0, nil
   173  	default:
   174  		p.State.LuciProject = proc.luciProject
   175  		proc.loadedPState = p.State
   176  		s.PB = p.State
   177  		return s, eventbox.EVersion(p.EVersion), nil
   178  	}
   179  }
   180  
   181  // PrepareMutation is called before a transaction to compute transitions.
   182  //
   183  // All actions that must be done atomically with updating state must be
   184  // encapsulated inside Transition.SideEffectFn callback.
   185  func (proc *pmProcessor) PrepareMutation(ctx context.Context, events eventbox.Events, s eventbox.State) (ts []eventbox.Transition, noops eventbox.Events, err error) {
   186  	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/Mutate")
   187  	defer func() { tracing.End(span, err) }()
   188  
   189  	tr := &triageResult{}
   190  	for _, e := range events {
   191  		tr.triage(ctx, e)
   192  	}
   193  	tr.removeCLUpdateNoops()
   194  
   195  	ts, err = proc.mutate(ctx, tr, s.(*state.State))
   196  	return ts, tr.noops, err
   197  }
   198  
   199  // FetchEVersion is called at the beginning of a transaction.
   200  //
   201  // The returned EVersion is compared against the one associated with a state
   202  // loaded via GetState. If different, the transaction is aborted and new state
   203  // isn't saved.
   204  func (proc *pmProcessor) FetchEVersion(ctx context.Context) (eventbox.EVersion, error) {
   205  	p := &prjmanager.Project{ID: proc.luciProject}
   206  	switch err := datastore.Get(ctx, p); {
   207  	case err == datastore.ErrNoSuchEntity:
   208  		return 0, nil
   209  	case err != nil:
   210  		return 0, errors.Annotate(err, "failed to get %q", proc.luciProject).Tag(transient.Tag).Err()
   211  	default:
   212  		return eventbox.EVersion(p.EVersion), nil
   213  	}
   214  }
   215  
   216  // SaveState is called in a transaction to save the state if it has changed.
   217  //
   218  // The passed EVersion is the incremented value of EVersion of what GetState
   219  // returned before.
   220  func (proc *pmProcessor) SaveState(ctx context.Context, st eventbox.State, ev eventbox.EVersion) error {
   221  	s := st.(*state.State)
   222  	// Erase PB.LuciProject as it's already stored as Project{ID:...}.
   223  	s.PB.LuciProject = ""
   224  
   225  	new := &prjmanager.Project{
   226  		ID:         proc.luciProject,
   227  		EVersion:   int64(ev),
   228  		UpdateTime: datastore.RoundTime(clock.Now(ctx).UTC()),
   229  		State:      s.PB,
   230  	}
   231  	entities := make([]any, 1, 3)
   232  	entities[0] = new
   233  
   234  	old := proc.loadedPState
   235  	if s.PB.GetConfigHash() != old.GetConfigHash() || s.PB.GetStatus() != old.GetStatus() {
   236  		entities = append(entities, &prjmanager.ProjectStateOffload{
   237  			Project:    datastore.MakeKey(ctx, prjmanager.ProjectKind, proc.luciProject),
   238  			Status:     s.PB.GetStatus(),
   239  			ConfigHash: s.PB.GetConfigHash(),
   240  			UpdateTime: clock.Now(ctx).UTC(),
   241  		})
   242  	}
   243  
   244  	switch reasons := s.LogReasons; {
   245  	case new.EVersion%logProjectStateFrequency == 0:
   246  		reasons = append(s.LogReasons, prjpb.LogReason_FYI_PERIODIC)
   247  		fallthrough
   248  	case len(reasons) > 0:
   249  		deduped := prjpb.SortAndDedupeLogReasons(reasons)
   250  		txndefer.Defer(ctx, func(ctx context.Context) {
   251  			logging.Debugf(ctx, "Saved ProjectLog @ %d due to %s", new.EVersion, prjpb.FormatLogReasons(deduped))
   252  		})
   253  		entities = append(entities, &prjmanager.ProjectLog{
   254  			Project:    datastore.MakeKey(ctx, prjmanager.ProjectKind, proc.luciProject),
   255  			EVersion:   new.EVersion,
   256  			Status:     s.PB.GetStatus(),
   257  			ConfigHash: s.PB.GetConfigHash(),
   258  			State:      new.State,
   259  			UpdateTime: new.UpdateTime,
   260  			Reasons:    deduped,
   261  		})
   262  	}
   263  
   264  	if err := datastore.Put(ctx, entities...); err != nil {
   265  		return errors.Annotate(err, "failed to put Project").Tag(transient.Tag).Err()
   266  	}
   267  	return nil
   268  }
   269  
// triageResult is the result of the triage of the incoming events.
//
// It is filled by calling triage once per event, followed by one
// removeCLUpdateNoops call; mutate then turns it into transitions.
type triageResult struct {
	// noops are events that can be safely deleted before a transaction
	// because another semantically **superseding** event will remain in
	// eventbox.
	//
	// Safety note: semantically the same event isn't sufficient, since
	// concurrent invocations of a PM must agree on which events can be deleted
	// and which must be kept.
	noops eventbox.Events

	// newConfig stores newConfig event with the largest ID if any.
	newConfig eventbox.Events
	// poke stores Poke event with the largest ID if any.
	poke eventbox.Events

	// clsUpdated aggregates all ClsUpdated events.
	clsUpdated struct {
		// maps CLID to latest EVersion.
		clEVersions map[int64]int64
		// maps CLID to event ID of CLUpdated or CLsUpdated events.
		clEvents map[int64]string
		// initially, all events. removeCLUpdateNoops() leaves only referenced ones.
		events eventbox.Events
	}
	// runsCreated aggregates all RunCreated events.
	runsCreated struct {
		// events and runs are in random order.
		events eventbox.Events
		runs   common.RunIDs
	}
	// runsFinished aggregates all RunFinished events; runs maps the ID of each
	// finished Run to the status reported by its event.
	runsFinished struct {
		events eventbox.Events
		runs   map[common.RunID]run.Status
	}
	// purgesCompleted aggregates all PurgeCompleted events; events[i] is the
	// event which carried purges[i].
	purgesCompleted struct {
		events eventbox.Events
		purges []*prjpb.PurgeCompleted
	}
	// triggeringCLDepsCompleted aggregates all TriggeringClDepsCompleted
	// events; events[i] is the event which carried triggers[i].
	triggeringCLDepsCompleted struct {
		events   eventbox.Events
		triggers []*prjpb.TriggeringCLDepsCompleted
	}
}
   312  
   313  func (tr *triageResult) triage(ctx context.Context, item eventbox.Event) {
   314  	e := &prjpb.Event{}
   315  	if err := proto.Unmarshal(item.Value, e); err != nil {
   316  		// This is a bug in code or data corruption.
   317  		// There is no way to recover on its own.
   318  		logging.Errorf(ctx, "CRITICAL: failed to deserialize event %q: %s", item.ID, err)
   319  		panic(err)
   320  	}
   321  	switch v := e.GetEvent().(type) {
   322  	case *prjpb.Event_NewConfig:
   323  		tr.highestIDWins(item, &tr.newConfig)
   324  	case *prjpb.Event_Poke:
   325  		tr.highestIDWins(item, &tr.poke)
   326  
   327  	case *prjpb.Event_ClsUpdated:
   328  		tr.clsUpdated.events = append(tr.clsUpdated.events, item)
   329  		for _, cl := range v.ClsUpdated.GetEvents() {
   330  			tr.triageCLUpdated(cl, item.ID)
   331  		}
   332  
   333  	case *prjpb.Event_RunCreated:
   334  		tr.runsCreated.events = append(tr.runsCreated.events, item)
   335  		tr.runsCreated.runs = append(tr.runsCreated.runs, common.RunID(v.RunCreated.GetRunId()))
   336  	case *prjpb.Event_RunFinished:
   337  		tr.runsFinished.events = append(tr.runsFinished.events, item)
   338  		if tr.runsFinished.runs == nil {
   339  			tr.runsFinished.runs = make(map[common.RunID]run.Status)
   340  		}
   341  		tr.runsFinished.runs[common.RunID(v.RunFinished.GetRunId())] = v.RunFinished.GetStatus()
   342  	case *prjpb.Event_PurgeCompleted:
   343  		tr.purgesCompleted.events = append(tr.purgesCompleted.events, item)
   344  		tr.purgesCompleted.purges = append(tr.purgesCompleted.purges, v.PurgeCompleted)
   345  	case *prjpb.Event_TriggeringClDepsCompleted:
   346  		tr.triggeringCLDepsCompleted.events = append(tr.triggeringCLDepsCompleted.events, item)
   347  		tr.triggeringCLDepsCompleted.triggers = append(tr.triggeringCLDepsCompleted.triggers, v.TriggeringClDepsCompleted)
   348  	default:
   349  		panic(fmt.Errorf("unknown event: %T [id=%q]", e.GetEvent(), item.ID))
   350  	}
   351  }
   352  
   353  func (tr *triageResult) highestIDWins(item eventbox.Event, target *eventbox.Events) {
   354  	if len(*target) == 0 {
   355  		*target = eventbox.Events{item}
   356  		return
   357  	}
   358  	if i := (*target)[0]; i.ID < item.ID {
   359  		tr.noops = append(tr.noops, i)
   360  		(*target)[0] = item
   361  	} else {
   362  		tr.noops = append(tr.noops, item)
   363  	}
   364  }
   365  
   366  func (tr *triageResult) triageCLUpdated(v *changelist.CLUpdatedEvent, id string) {
   367  	clid := v.GetClid()
   368  	ev := v.GetEversion()
   369  
   370  	cu := &tr.clsUpdated
   371  	if curEV, exists := cu.clEVersions[v.GetClid()]; !exists || curEV < ev {
   372  		if cu.clEVersions == nil {
   373  			cu.clEVersions = make(map[int64]int64, 1)
   374  			cu.clEvents = make(map[int64]string, 1)
   375  		}
   376  		cu.clEVersions[clid] = ev
   377  		cu.clEvents[clid] = id
   378  	}
   379  }
   380  
   381  func (tr *triageResult) removeCLUpdateNoops() {
   382  	cu := &tr.clsUpdated
   383  	eventIDs := stringset.New(len(cu.clEvents))
   384  	for _, id := range cu.clEvents {
   385  		eventIDs.Add(id)
   386  	}
   387  	remaining := cu.events[:0]
   388  	for _, e := range cu.events {
   389  		if eventIDs.Has(e.ID) {
   390  			remaining = append(remaining, e)
   391  		} else {
   392  			tr.noops = append(tr.noops, e)
   393  		}
   394  	}
   395  	cu.events = remaining
   396  	cu.clEvents = nil // free memory
   397  }
   398  
   399  func (proc *pmProcessor) mutate(ctx context.Context, tr *triageResult, s *state.State) ([]eventbox.Transition, error) {
   400  	var err error
   401  	var se state.SideEffect
   402  	ret := make([]eventbox.Transition, 0, 7)
   403  	var evIndexesToConsume []int
   404  
   405  	if upgraded := s.UpgradeIfNecessary(); upgraded != s {
   406  		ret = append(ret, eventbox.Transition{TransitionTo: upgraded})
   407  		s = upgraded
   408  	}
   409  
   410  	// Visit all non-empty fields of triageResult and emit Transitions.
   411  	// The order of visits matters.
   412  
   413  	// Even though OnRunCreated event is sent before OnRunFinished event,
   414  	// under rare conditions it's possible that OnRunsFinished will be read first,
   415  	// and OnRunsCreated will be read only in the next PM invocation
   416  	// (see https://crbug.com/1218681 for a concrete example).
   417  	if len(tr.runsCreated.runs) > 0 {
   418  		if s, se, err = proc.handler.OnRunsCreated(ctx, s, tr.runsCreated.runs); err != nil {
   419  			return nil, err
   420  		}
   421  		ret = append(ret, eventbox.Transition{
   422  			Events:       tr.runsCreated.events,
   423  			SideEffectFn: state.SideEffectFn(se),
   424  			TransitionTo: s,
   425  		})
   426  	}
   427  
   428  	if len(tr.runsFinished.runs) > 0 {
   429  		if s, se, err = proc.handler.OnRunsFinished(ctx, s, tr.runsFinished.runs); err != nil {
   430  			return nil, err
   431  		}
   432  		ret = append(ret, eventbox.Transition{
   433  			Events:       tr.runsFinished.events,
   434  			SideEffectFn: state.SideEffectFn(se),
   435  			TransitionTo: s,
   436  		})
   437  	}
   438  
   439  	// UpdateConfig event may result in stopping the PM, which requires notifying
   440  	// each of the incomplete Runs to stop. Thus, runsCreated must be processed
   441  	// before to ensure no Run will be missed.
   442  	if len(tr.newConfig) > 0 {
   443  		if s, se, err = proc.handler.UpdateConfig(ctx, s); err != nil {
   444  			return nil, err
   445  		}
   446  		ret = append(ret, eventbox.Transition{
   447  			Events:       tr.newConfig,
   448  			SideEffectFn: state.SideEffectFn(se),
   449  			TransitionTo: s,
   450  		})
   451  	}
   452  
   453  	if len(tr.poke) > 0 {
   454  		if s, se, err = proc.handler.Poke(ctx, s); err != nil {
   455  			return nil, err
   456  		}
   457  		ret = append(ret, eventbox.Transition{
   458  			Events:       tr.poke,
   459  			SideEffectFn: state.SideEffectFn(se),
   460  			TransitionTo: s,
   461  		})
   462  	}
   463  
   464  	if len(tr.clsUpdated.clEVersions) > 0 {
   465  		if s, se, err = proc.handler.OnCLsUpdated(ctx, s, tr.clsUpdated.clEVersions); err != nil {
   466  			return nil, err
   467  		}
   468  		ret = append(ret, eventbox.Transition{
   469  			Events:       tr.clsUpdated.events,
   470  			SideEffectFn: state.SideEffectFn(se),
   471  			TransitionTo: s,
   472  		})
   473  	}
   474  
   475  	// OnPurgesCompleted may expire purges even without incoming event.
   476  	if s, se, evIndexesToConsume, err = proc.handler.OnPurgesCompleted(ctx, s, tr.purgesCompleted.purges); err != nil {
   477  		return nil, err
   478  	}
   479  	ret = append(ret, eventbox.Transition{
   480  		Events:       shallowCopyEvents(tr.purgesCompleted.events, evIndexesToConsume),
   481  		SideEffectFn: state.SideEffectFn(se),
   482  		TransitionTo: s,
   483  	})
   484  
   485  	// OnTriggeringCLDepsCompleted may expire triggers even without incoming event.
   486  	s, se, evIndexesToConsume, err = proc.handler.OnTriggeringCLDepsCompleted(ctx, s,
   487  		tr.triggeringCLDepsCompleted.triggers,
   488  	)
   489  	if err != nil {
   490  		return nil, err
   491  	}
   492  	ret = append(ret, eventbox.Transition{
   493  		Events:       shallowCopyEvents(tr.triggeringCLDepsCompleted.events, evIndexesToConsume),
   494  		SideEffectFn: state.SideEffectFn(se),
   495  		TransitionTo: s,
   496  	})
   497  
   498  	if s, se, err = proc.handler.ExecDeferred(ctx, s); err != nil {
   499  		return nil, err
   500  	}
   501  	return append(ret, eventbox.Transition{
   502  		SideEffectFn: state.SideEffectFn(se),
   503  		TransitionTo: s,
   504  	}), nil
   505  }
   506  
   507  func shallowCopyEvents(events []eventbox.Event, indexesToCopy []int) []eventbox.Event {
   508  	if len(events) == len(indexesToCopy) {
   509  		return events
   510  	}
   511  	ret := make([]eventbox.Event, len(events))
   512  	for i, index := range indexesToCopy {
   513  		ret[i] = events[index]
   514  	}
   515  	return ret
   516  }