go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/appengine/tsmon/tasknum.go (about)

     1  // Copyright 2016 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tsmon
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"go.chromium.org/luci/gae/filter/dscache"
    23  	"go.chromium.org/luci/gae/service/datastore"
    24  	"go.chromium.org/luci/gae/service/info"
    25  
    26  	"go.chromium.org/luci/common/clock"
    27  	"go.chromium.org/luci/common/errors"
    28  	"go.chromium.org/luci/common/logging"
    29  	"go.chromium.org/luci/common/sync/parallel"
    30  	"go.chromium.org/luci/common/tsmon/target"
    31  	"go.chromium.org/luci/server/tsmon"
    32  )
    33  
// DatastoreNamespace is a datastore namespace with all tsmon state.
//
// dsContext switches the context to this namespace before any 'instance'
// entity is read or written.
const DatastoreNamespace = "ts_mon_instance_namespace"
    36  
    37  // DatastoreTaskNumAllocator implements TaskNumAllocator on top of datastore.
    38  //
    39  // Its NotifyTaskIsAlive registers a claim for a task number, which is later
    40  // fulfilled by the housekeeping cron (see AssignTaskNumbers).
    41  type DatastoreTaskNumAllocator struct {
    42  }
    43  
    44  // NotifyTaskIsAlive is part of TaskNumAllocator interface.
    45  func (DatastoreTaskNumAllocator) NotifyTaskIsAlive(ctx context.Context, task *target.Task, instanceID string) (taskNum int, err error) {
    46  	ctx = dsContext(ctx)
    47  
    48  	// Exact values here are not important. Important properties are:
    49  	//  * 'entityID' is unique, and depends on both 'task' and 'instanceID'.
    50  	//  * All instances from same task have same 'target' value.
    51  	//
    52  	// The cron ('AssignTaskNumbers') will fetch all entities and will group them
    53  	// by 'target' value before assigning task numbers.
    54  	target := fmt.Sprintf("%s|%s|%s|%s", task.DataCenter, task.ServiceName, task.JobName, task.HostName)
    55  	entityID := fmt.Sprintf("%s|%s", target, instanceID)
    56  
    57  	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
    58  		entity := instance{ID: entityID}
    59  		switch err := datastore.Get(ctx, &entity); {
    60  		case err == datastore.ErrNoSuchEntity:
    61  			entity.Target = target
    62  			entity.TaskNum = -1
    63  		case err != nil:
    64  			return err
    65  		}
    66  		entity.LastUpdated = clock.Now(ctx).UTC()
    67  		taskNum = entity.TaskNum
    68  		return datastore.Put(ctx, &entity)
    69  	}, nil)
    70  	if err == nil && taskNum == -1 {
    71  		err = tsmon.ErrNoTaskNumber
    72  	}
    73  	return
    74  }
    75  
    76  // AssignTaskNumbers updates the set of task number requests created with
    77  // DatastoreTaskNumAllocator.
    78  //
    79  // It assigns unique task numbers to those without ones set, and expires old
    80  // ones (thus reclaiming task numbers assigned to them).
    81  //
    82  // Must be used from some (global per project) cron if DatastoreTaskNumAllocator
    83  // is used. Use 'InstallHandlers' to install the corresponding cron handler.
    84  func AssignTaskNumbers(ctx context.Context) error {
    85  	ctx = dsContext(ctx)
    86  
    87  	now := clock.Now(ctx)
    88  	cutoff := now.Add(-instanceExpirationTimeout)
    89  	perTarget := map[string]*workingSet{}
    90  
    91  	// Enumerate all instances stored in the datastore and expire old ones (in
    92  	// batches). Collect a set of used task numbers and a set of instances not
    93  	// assigned a number yet. Group by 'Target' (can be "" for old entities, this
    94  	// is fine).
    95  	q := datastore.NewQuery("Instance")
    96  	err := datastore.RunBatch(ctx, int32(taskQueryBatchSize), q, func(entity *instance) error {
    97  		set := perTarget[entity.Target]
    98  		if set == nil {
    99  			set = newWorkingSet(cutoff)
   100  			perTarget[entity.Target] = set
   101  		}
   102  		set.addInstance(ctx, entity)
   103  		if len(set.expired) >= taskQueryBatchSize {
   104  			if err := set.cleanupExpired(ctx); err != nil {
   105  				return err
   106  			}
   107  		}
   108  		return nil
   109  	})
   110  	if err != nil {
   111  		return errors.Annotate(err, "failed to enumerate or expire entries").Err()
   112  	}
   113  
   114  	return parallel.FanOutIn(func(tasks chan<- func() error) {
   115  		for target, set := range perTarget {
   116  			target := target
   117  			set := set
   118  
   119  			tasks <- func() error {
   120  				// "Flush" all pending expired instances.
   121  				if err := set.cleanupExpired(ctx); err != nil {
   122  					logging.WithError(err).Errorf(ctx, "Failed to delete expired entries for target %q", target)
   123  					return err
   124  				}
   125  				// Assign task numbers to those that don't have one assigned yet.
   126  				logging.Debugf(ctx, "Found %d expired and %d unassigned instances for target %q", set.totalExpired, len(set.pending), target)
   127  				if err := set.assignTaskNumbers(ctx); err != nil {
   128  					logging.WithError(err).Errorf(ctx, "Failed to assign task numbers for target %q", target)
   129  					return err
   130  				}
   131  				return nil
   132  			}
   133  		}
   134  	})
   135  }
   136  
   137  ////////////////////////////////////////////////////////////////////////////////
   138  
// Tuning knobs of AssignTaskNumbers:
//   - instanceExpirationTimeout: an entity whose LastUpdated is older than
//     this is treated as expired and gets deleted.
//   - taskQueryBatchSize: the batch size passed to the datastore query, and
//     also the number of expired keys accumulated per target before they are
//     deleted in one call.
const (
	instanceExpirationTimeout = 30 * time.Minute
	taskQueryBatchSize        = 500
)
   143  
   144  // dsContext is used for all datastore accesses that touch 'instance' entities.
   145  //
   146  // It switches the namespace and disables dscache, since these entities are
   147  // updated from Flex, which doesn't work with dscache. Besides, all reads happen
   148  // either in transactions or through queries - dscache is useless anyhow.
   149  func dsContext(ctx context.Context) context.Context {
   150  	ctx = info.MustNamespace(ctx, DatastoreNamespace)
   151  	return dscache.AddShardFunctions(ctx, func(*datastore.Key) (shards int, ok bool) {
   152  		return 0, true
   153  	})
   154  }
   155  
// instance corresponds to one process that flushes metrics.
//
// Entities are created and refreshed by NotifyTaskIsAlive and are enumerated,
// expired and assigned task numbers by AssignTaskNumbers.
//
// Fields:
//   - ID: the entity ID; NotifyTaskIsAlive builds it as "<target>|<instanceID>".
//   - Target: the value AssignTaskNumbers groups entities by; may be "" for
//     old entities.
//   - TaskNum: the assigned task number; NotifyTaskIsAlive stores -1 in new
//     entities and any negative value is treated as "not assigned yet".
//   - LastUpdated: the UTC time of the last NotifyTaskIsAlive call; compared
//     against the expiration cutoff by the cron.
type instance struct {
	_kind  string                `gae:"$kind,Instance"`
	_extra datastore.PropertyMap `gae:"-,extra"`

	ID          string    `gae:"$id"`
	Target      string    `gae:"target,noindex"`
	TaskNum     int       `gae:"task_num,noindex"`
	LastUpdated time.Time `gae:"last_updated,noindex"`

	// Disable dscache to allow these entities to be updated from Flex, which
	// doesn't work with dscache. Besides, we update these entities from
	// transactions or based on queries - dscache is useless anyway.
	_ datastore.Toggle `gae:"$dscache.enable,false"`
}
   171  
// workingSet is used internally by AssignTaskNumbers.
//
// It accumulates the state of the 'instance' entities that share one 'Target':
// addInstance files each entity under exactly one of 'expired', 'pending' or
// 'assignedNums'.
type workingSet struct {
	cutoff       time.Time        // if LastUpdated is before this => instance has expired
	expired      []*datastore.Key // keys of expired entities not deleted yet (see cleanupExpired)
	pending      []*instance      // non-expired entities whose TaskNum is still negative
	assignedNums map[int]struct{} // task numbers already held by non-expired entities

	totalExpired int // total number of entities deleted so far by cleanupExpired
}
   181  
   182  func newWorkingSet(cutoff time.Time) *workingSet {
   183  	return &workingSet{
   184  		cutoff:       cutoff,
   185  		assignedNums: map[int]struct{}{},
   186  	}
   187  }
   188  
   189  func (s *workingSet) addInstance(ctx context.Context, entity *instance) {
   190  	switch {
   191  	case entity.LastUpdated.Before(s.cutoff):
   192  		logging.Debugf(ctx, "Expiring %q (task_num %d), inactive since %s",
   193  			entity.ID, entity.TaskNum, entity.LastUpdated)
   194  		s.expired = append(s.expired, datastore.KeyForObj(ctx, entity))
   195  	case entity.TaskNum < 0:
   196  		s.pending = append(s.pending, entity)
   197  	default:
   198  		s.assignedNums[entity.TaskNum] = struct{}{}
   199  	}
   200  }
   201  
   202  func (s *workingSet) cleanupExpired(ctx context.Context) error {
   203  	if len(s.expired) == 0 {
   204  		return nil
   205  	}
   206  
   207  	logging.Debugf(ctx, "Expiring %d instance(s)", len(s.expired))
   208  	if err := datastore.Delete(ctx, s.expired); err != nil {
   209  		return err
   210  	}
   211  
   212  	s.totalExpired += len(s.expired)
   213  	s.expired = s.expired[:0]
   214  	return nil
   215  }
   216  
   217  func (s *workingSet) assignTaskNumbers(ctx context.Context) error {
   218  	if len(s.pending) == 0 {
   219  		return nil
   220  	}
   221  
   222  	nextNum := gapFinder(s.assignedNums)
   223  	for _, entity := range s.pending {
   224  		entity.TaskNum = nextNum()
   225  		logging.Debugf(ctx, "Assigned %q task_num %d", entity.ID, entity.TaskNum)
   226  	}
   227  
   228  	// Update all pending entities. This is non-transactional, meaning:
   229  	//  * We may override newer LastUpdated - no big deal.
   230  	//  * If there are two parallel 'AssignTaskNumbers', they'll screw up each
   231  	//    other. GAE cron gives some protection against concurrent cron job
   232  	//    executions though.
   233  	return datastore.Put(ctx, s.pending)
   234  }
   235  
   236  func gapFinder(used map[int]struct{}) func() int {
   237  	next := 0
   238  	return func() int {
   239  		for {
   240  			n := next
   241  			next++
   242  			_, has := used[n]
   243  			if !has {
   244  				return n
   245  			}
   246  		}
   247  	}
   248  }