go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/appengine/tsmon/tasknum.go (about) 1 // Copyright 2016 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tsmon 16 17 import ( 18 "context" 19 "fmt" 20 "time" 21 22 "go.chromium.org/luci/gae/filter/dscache" 23 "go.chromium.org/luci/gae/service/datastore" 24 "go.chromium.org/luci/gae/service/info" 25 26 "go.chromium.org/luci/common/clock" 27 "go.chromium.org/luci/common/errors" 28 "go.chromium.org/luci/common/logging" 29 "go.chromium.org/luci/common/sync/parallel" 30 "go.chromium.org/luci/common/tsmon/target" 31 "go.chromium.org/luci/server/tsmon" 32 ) 33 34 // DatastoreNamespace is a datastore namespace with all tsmon state. 35 const DatastoreNamespace = "ts_mon_instance_namespace" 36 37 // DatastoreTaskNumAllocator implements TaskNumAllocator on top of datastore. 38 // 39 // Its NotifyTaskIsAlive registers a claim for a task number, which is later 40 // fulfilled by the housekeeping cron (see AssignTaskNumbers). 41 type DatastoreTaskNumAllocator struct { 42 } 43 44 // NotifyTaskIsAlive is part of TaskNumAllocator interface. 45 func (DatastoreTaskNumAllocator) NotifyTaskIsAlive(ctx context.Context, task *target.Task, instanceID string) (taskNum int, err error) { 46 ctx = dsContext(ctx) 47 48 // Exact values here are not important. Important properties are: 49 // * 'entityID' is unique, and depends on both 'task' and 'instanceID'. 50 // * All instances from same task have same 'target' value. 51 // 52 // The cron ('AssignTaskNumbers') will fetch all entities and will group them 53 // by 'target' value before assigning task numbers. 54 target := fmt.Sprintf("%s|%s|%s|%s", task.DataCenter, task.ServiceName, task.JobName, task.HostName) 55 entityID := fmt.Sprintf("%s|%s", target, instanceID) 56 57 err = datastore.RunInTransaction(ctx, func(ctx context.Context) error { 58 entity := instance{ID: entityID} 59 switch err := datastore.Get(ctx, &entity); { 60 case err == datastore.ErrNoSuchEntity: 61 entity.Target = target 62 entity.TaskNum = -1 63 case err != nil: 64 return err 65 } 66 entity.LastUpdated = clock.Now(ctx).UTC() 67 taskNum = entity.TaskNum 68 return datastore.Put(ctx, &entity) 69 }, nil) 70 if err == nil && taskNum == -1 { 71 err = tsmon.ErrNoTaskNumber 72 } 73 return 74 } 75 76 // AssignTaskNumbers updates the set of task number requests created with 77 // DatastoreTaskNumAllocator. 78 // 79 // It assigns unique task numbers to those without ones set, and expires old 80 // ones (thus reclaiming task numbers assigned to them). 81 // 82 // Must be used from some (global per project) cron if DatastoreTaskNumAllocator 83 // is used. Use 'InstallHandlers' to install the corresponding cron handler. 84 func AssignTaskNumbers(ctx context.Context) error { 85 ctx = dsContext(ctx) 86 87 now := clock.Now(ctx) 88 cutoff := now.Add(-instanceExpirationTimeout) 89 perTarget := map[string]*workingSet{} 90 91 // Enumerate all instances stored in the datastore and expire old ones (in 92 // batches). Collect a set of used task numbers and a set of instances not 93 // assigned a number yet. Group by 'Target' (can be "" for old entities, this 94 // is fine). 95 q := datastore.NewQuery("Instance") 96 err := datastore.RunBatch(ctx, int32(taskQueryBatchSize), q, func(entity *instance) error { 97 set := perTarget[entity.Target] 98 if set == nil { 99 set = newWorkingSet(cutoff) 100 perTarget[entity.Target] = set 101 } 102 set.addInstance(ctx, entity) 103 if len(set.expired) >= taskQueryBatchSize { 104 if err := set.cleanupExpired(ctx); err != nil { 105 return err 106 } 107 } 108 return nil 109 }) 110 if err != nil { 111 return errors.Annotate(err, "failed to enumerate or expire entries").Err() 112 } 113 114 return parallel.FanOutIn(func(tasks chan<- func() error) { 115 for target, set := range perTarget { 116 target := target 117 set := set 118 119 tasks <- func() error { 120 // "Flush" all pending expired instances. 121 if err := set.cleanupExpired(ctx); err != nil { 122 logging.WithError(err).Errorf(ctx, "Failed to delete expired entries for target %q", target) 123 return err 124 } 125 // Assign task numbers to those that don't have one assigned yet. 126 logging.Debugf(ctx, "Found %d expired and %d unassigned instances for target %q", set.totalExpired, len(set.pending), target) 127 if err := set.assignTaskNumbers(ctx); err != nil { 128 logging.WithError(err).Errorf(ctx, "Failed to assign task numbers for target %q", target) 129 return err 130 } 131 return nil 132 } 133 } 134 }) 135 } 136 137 //////////////////////////////////////////////////////////////////////////////// 138 139 const ( 140 instanceExpirationTimeout = 30 * time.Minute 141 taskQueryBatchSize = 500 142 ) 143 144 // dsContext is used for all datastore accesses that touch 'instance' entities. 145 // 146 // It switches the namespace and disables dscache, since these entities are 147 // updated from Flex, which doesn't work with dscache. Besides, all reads happen 148 // either in transactions or through queries - dscache is useless anyhow. 149 func dsContext(ctx context.Context) context.Context { 150 ctx = info.MustNamespace(ctx, DatastoreNamespace) 151 return dscache.AddShardFunctions(ctx, func(*datastore.Key) (shards int, ok bool) { 152 return 0, true 153 }) 154 } 155 156 // instance corresponds to one process that flushes metrics. 157 type instance struct { 158 _kind string `gae:"$kind,Instance"` 159 _extra datastore.PropertyMap `gae:"-,extra"` 160 161 ID string `gae:"$id"` 162 Target string `gae:"target,noindex"` 163 TaskNum int `gae:"task_num,noindex"` 164 LastUpdated time.Time `gae:"last_updated,noindex"` 165 166 // Disable dscache to allow these entities be updated from Flex, which doesn't 167 // work with dscache. Besides, we update these entities from transactions or 168 // based on queries - dscache is useless anywhere. 169 _ datastore.Toggle `gae:"$dscache.enable,false"` 170 } 171 172 // workingSet is used internally by AssignTaskNumbers. 173 type workingSet struct { 174 cutoff time.Time // if LastUpdate is before => instance has expired 175 expired []*datastore.Key // entities with LastUpdate too long ago 176 pending []*instance // entities with TaskNum is still -1 177 assignedNums map[int]struct{} // assigned already task numbers 178 179 totalExpired int // total number of entities deleted 180 } 181 182 func newWorkingSet(cutoff time.Time) *workingSet { 183 return &workingSet{ 184 cutoff: cutoff, 185 assignedNums: map[int]struct{}{}, 186 } 187 } 188 189 func (s *workingSet) addInstance(ctx context.Context, entity *instance) { 190 switch { 191 case entity.LastUpdated.Before(s.cutoff): 192 logging.Debugf(ctx, "Expiring %q (task_num %d), inactive since %s", 193 entity.ID, entity.TaskNum, entity.LastUpdated) 194 s.expired = append(s.expired, datastore.KeyForObj(ctx, entity)) 195 case entity.TaskNum < 0: 196 s.pending = append(s.pending, entity) 197 default: 198 s.assignedNums[entity.TaskNum] = struct{}{} 199 } 200 } 201 202 func (s *workingSet) cleanupExpired(ctx context.Context) error { 203 if len(s.expired) == 0 { 204 return nil 205 } 206 207 logging.Debugf(ctx, "Expiring %d instance(s)", len(s.expired)) 208 if err := datastore.Delete(ctx, s.expired); err != nil { 209 return err 210 } 211 212 s.totalExpired += len(s.expired) 213 s.expired = s.expired[:0] 214 return nil 215 } 216 217 func (s *workingSet) assignTaskNumbers(ctx context.Context) error { 218 if len(s.pending) == 0 { 219 return nil 220 } 221 222 nextNum := gapFinder(s.assignedNums) 223 for _, entity := range s.pending { 224 entity.TaskNum = nextNum() 225 logging.Debugf(ctx, "Assigned %q task_num %d", entity.ID, entity.TaskNum) 226 } 227 228 // Update all pending entities. This is non-transactional, meaning: 229 // * We may override newer LastUpdated - no big deal. 230 // * If there are two parallel 'AssignTaskNumbers', they'll screw up each 231 // other. GAE cron gives some protection against concurrent cron job 232 // executions though. 233 return datastore.Put(ctx, s.pending) 234 } 235 236 func gapFinder(used map[int]struct{}) func() int { 237 next := 0 238 return func() int { 239 for { 240 n := next 241 next++ 242 _, has := used[n] 243 if !has { 244 return n 245 } 246 } 247 } 248 }