go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tsmon/middleware.go

// Copyright 2017 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsmon

import (
	"context"
	"sync"
	"time"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/iotools"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/tsmon"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/monitor"
	"go.chromium.org/luci/common/tsmon/runtimestats"
	"go.chromium.org/luci/common/tsmon/store"
	"go.chromium.org/luci/common/tsmon/target"
	"go.chromium.org/luci/common/tsmon/versions"

	"go.chromium.org/luci/server/router"
)

// State holds the state and configuration of the tsmon library.
//
// Define it as a global variable and inject it into request contexts using
// State.Middleware().
//
// It will initialize itself from the tsmon state in the passed context on
// first use, mutating it along the way. It assumes the caller consistently
// uses contexts configured with the exact same tsmon state (in the vast
// majority of cases it is the global tsmon state that corresponds to
// context.Background, but unit tests may provide their own state).
//
// Will panic if it detects that the caller has changed the tsmon state in the
// context between requests.
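//
// A typical wiring in a server binary looks roughly like this (an illustrative
// sketch only; the target values, the variable names and the router setup are
// placeholders, not part of this package):
//
//	var globalState = State{
//		Target: func(ctx context.Context) target.Task {
//			return target.Task{ServiceName: "my-service", JobName: "default"}
//		},
//	}
//
//	func install(ctx context.Context, r *router.Router) {
//		globalState.Activate(ctx)
//		r.Use(router.NewMiddlewareChain(globalState.Middleware))
//		go globalState.FlushPeriodically(ctx) // required when FlushInMiddleware is false
//	}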
type State struct {
	// Target is lazily called to initialize the default metrics target.
	//
	// The target identifies the collection of homogeneous processes that
	// together implement the service. Each individual process in the collection
	// is additionally identified by a task number, later (optionally)
	// dynamically assigned via TaskNumAllocator based on a unique InstanceID.
	Target func(ctx context.Context) target.Task

	// InstanceID returns a unique (within the scope of the service) identifier
	// of this particular process.
	//
	// It will be used to assign a free task number via TaskNumAllocator.
	//
	// If nil, the instance ID will be set to "". Useful when TaskNumAllocator is
	// nil too (then the instance ID is not important).
	InstanceID func(ctx context.Context) string

	// TaskNumAllocator knows how to dynamically map an instance ID to a task
	// number.
	//
	// If nil, 0 will be used as the task number regardless of the instance ID.
	TaskNumAllocator TaskNumAllocator

	// CustomMonitor, if not nil, is used to flush accumulated metrics instead of
	// the default monitor (which is constructed on the fly based on current
	// settings).
	//
	// Useful to override the default monitor in tests or when running locally
	// in debug mode.
	CustomMonitor monitor.Monitor

	// FlushInMiddleware, if true, makes Middleware(...) periodically send
	// metrics to the backend synchronously after handling a request.
	//
	// This is useful on GAE Standard, which doesn't support asynchronous flushes
	// outside of the context of some request.
	//
	// If false, the user of the library *must* launch FlushPeriodically() in
	// a background goroutine. Otherwise metrics won't be sent.
	FlushInMiddleware bool

	// Settings, if non-nil, are static preset settings to use.
	//
	// If nil, settings will be loaded dynamically through the 'settings' module.
	Settings *Settings

	lock sync.RWMutex

	state        *tsmon.State
	lastSettings Settings

	instanceID  string        // cached result of InstanceID() call
	flushingNow bool          // true if some goroutine is flushing right now
	nextFlush   time.Time     // next time we should do the flush
	lastFlush   time.Time     // last successful flush
	flushRetry  time.Duration // flush retry delay
}

const (
	// noFlushErrorThreshold defines when we start to complain in the error log
	// that the last successful flush (if ever) was too long ago.
	noFlushErrorThreshold = 5 * time.Minute

	// flushMaxRetry defines the maximum delay between flush retries.
	flushMaxRetry = 10 * time.Minute

	// flushInitialRetry defines the initial retry delay. It is doubled on every
	// retry, up to flushMaxRetry.
	flushInitialRetry = 5 * time.Second
)
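// Taken together with flushIfNeededImpl below: a healthy process flushes
// roughly every Settings.FlushIntervalSec seconds, while failed flushes are
// retried with exponential backoff (the delay doubles from flushInitialRetry
// until it reaches flushMaxRetry) and the delay resets back to
// flushInitialRetry once a flush attempt no longer fails.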
// Activate updates the tsmon state in the context to use this config.
//
// Not needed if metrics are updated only from inside the middleware.
func (s *State) Activate(ctx context.Context) {
	s.checkSettings(ctx)
}

// Middleware is a middleware that collects request metrics and triggers metric
// flushes.
func (s *State) Middleware(c *router.Context, next router.Handler) {
	state, settings := s.checkSettings(c.Request.Context())
	if settings.Enabled {
		started := clock.Now(c.Request.Context())
		req := c.Request
		userAgent, ok := req.Header["User-Agent"]
		if !ok || len(userAgent) == 0 {
			userAgent = []string{"Unknown"}
		}
		ctx := c.Request.Context()
		contentLength := c.Request.ContentLength
		nrw := iotools.NewResponseWriter(c.Writer)
		c.Writer = nrw
		defer func() {
			dur := clock.Now(ctx).Sub(started)
			metric.UpdateServerMetrics(ctx, c.HandlerPath, nrw.Status(), dur,
				contentLength, nrw.ResponseSize(), userAgent[0])
		}()
		next(c)
		if s.FlushInMiddleware {
			s.flushIfNeededImpl(ctx, state, settings)
		}
	} else {
		next(c)
	}
}

// FlushPeriodically runs a loop that periodically flushes metrics.
//
// Blocks until the context is canceled. Handles (and logs) errors internally.
func (s *State) FlushPeriodically(ctx context.Context) {
	for {
		var nextFlush time.Time
		state, settings := s.checkSettings(ctx)
		if settings.Enabled {
			nextFlush = s.flushIfNeededImpl(ctx, state, settings)
		}
		// Don't sleep less than 1 sec to avoid busy looping. It is always OK to
		// flush a sec later. In most cases we'll be sleeping ~= FlushIntervalSec.
		sleep := time.Second
		if dt := nextFlush.Sub(clock.Now(ctx)); dt > sleep {
			sleep = dt
		}
		if r := <-clock.After(ctx, sleep); r.Err != nil {
			return // the context is canceled
		}
	}
}

// checkSettings fetches tsmon settings and initializes, reinitializes or
// deinitializes tsmon, as needed.
//
// Returns the current tsmon state and settings. Panics if the context is using
// an unexpected tsmon state.
func (s *State) checkSettings(ctx context.Context) (*tsmon.State, *Settings) {
	state := tsmon.GetState(ctx)

	var settings Settings
	if s.Settings != nil {
		settings = *s.Settings
	} else {
		settings = fetchCachedSettings(ctx)
	}

	// Read the values used when handling the previous request. In most cases
	// they are identical to the current ones and we can skip grabbing a heavier
	// write lock.
	s.lock.RLock()
	if s.state == state && s.lastSettings == settings {
		s.lock.RUnlock()
		return state, &settings
	}
	s.lock.RUnlock()

	// 'settings' or 'state' has changed. Reinitialize tsmon as needed under
	// the write lock.
	s.lock.Lock()
	defer s.lock.Unlock()

	// First call to 'checkSettings' ever?
	if s.state == nil {
		s.state = state
		s.state.SetMonitor(monitor.NewNilMonitor())
		s.state.InhibitGlobalCallbacksOnFlush()
		if s.InstanceID != nil {
			s.instanceID = s.InstanceID(ctx)
		}
	} else if state != s.state {
		panic("tsmon state in the context was unexpectedly changed between requests")
	}

	switch {
	case !bool(s.lastSettings.Enabled) && bool(settings.Enabled):
		s.enableTSMon(ctx, settings.ProdXAccount)
	case bool(s.lastSettings.Enabled) && !bool(settings.Enabled):
		s.disableTSMon(ctx)
	}
	s.lastSettings = settings

	return state, &settings
}

// enableTSMon puts an in-memory metrics store in the context's tsmon state.
//
// Called with 's.lock' locked.
func (s *State) enableTSMon(ctx context.Context, prodXAccount string) {
	t := s.Target(ctx)
	t.TaskNum = -1 // will be assigned later via TaskNumAllocator

	s.state.SetStore(store.NewInMemory(&t))

	// Request the flush to be executed ASAP, so it registers a claim for a task
	// number via NotifyTaskIsAlive. Also reset 'lastFlush', so that we don't get
	// invalid logging that the last flush was a long time ago.
	s.nextFlush = clock.Now(ctx)
	s.lastFlush = s.nextFlush
	// Set the initial value for the retry delay.
	s.flushRetry = flushInitialRetry

	mon := s.CustomMonitor
	if mon == nil {
		prodx, err := NewProdXMonitor(ctx, 500, prodXAccount)
		if err != nil {
			logging.WithError(err).Errorf(ctx, "failed to dial ProdX; using NilMonitor()")
			mon = monitor.NewNilMonitor()
		} else {
			mon = prodx
		}
	}
	s.state.SetMonitor(mon)
}

// disableTSMon puts a nil metrics store in the context's tsmon state.
//
// Called with 's.lock' locked.
func (s *State) disableTSMon(ctx context.Context) {
	s.state.SetStore(store.NewNilStore())
	if mon := s.state.Monitor(); mon != nil {
		if err := mon.Close(); err != nil {
			logging.WithError(err).Errorf(ctx, "failed to close tsmon monitor; ignoring")
		}
	}
}

// flushIfNeededImpl periodically flushes the accumulated metrics.
//
// It skips the flush if some other goroutine is already flushing. Logs errors.
//
// Returns a timestamp when the next flush should occur. It may be in the past
// if the flush is happening concurrently right now in another goroutine.
//
// TODO(vadimsh): Refactor flushIfNeededImpl + FlushPeriodically to be less
// awkward. Historically, flushIfNeededImpl was used only from Middleware and
// FlushPeriodically was slapped on top later.
func (s *State) flushIfNeededImpl(ctx context.Context, state *tsmon.State, settings *Settings) (nextFlush time.Time) {
	// Used to compare 'nextFlush' to 'now'. Needed to make sure we really do
	// the flush after sleeping in FlushPeriodically even if we happened to wake
	// up a few nanoseconds earlier.
	const epsilonT = 100 * time.Millisecond

	now := clock.Now(ctx)

	// Most of the time the flush is not needed and we can get away with
	// a lightweight RLock.
	s.lock.RLock()
	nextFlush = s.nextFlush
	skip := s.flushingNow || now.Add(epsilonT).Before(nextFlush)
	s.lock.RUnlock()
	if skip {
		return
	}

	// Need to flush. Update 'flushingNow'. Redo the check under the write lock,
	// and grab a bunch of other values while we hold the lock; they will be
	// useful later.
	s.lock.Lock()
	lastFlush := s.lastFlush
	nextFlush = s.nextFlush
	skip = s.flushingNow || now.Add(epsilonT).Before(nextFlush)
	if !skip {
		s.flushingNow = true
	}
	s.lock.Unlock()
	if skip {
		return
	}

	// The flush must be fast. Limit it by some timeout.
	timeout := settings.FlushTimeoutSec
	if timeout == 0 {
		timeout = defaultSettings.FlushTimeoutSec
	}
	ctx, cancel := clock.WithTimeout(ctx, time.Duration(timeout)*time.Second)
	defer cancel()

	// Report per-process statistics.
	versions.Report(ctx)
	if settings.ReportRuntimeStats {
		runtimestats.Report(ctx)
	}

	// Unset 'flushingNow' no matter what (even on panics). If the flush has
	// failed, retry with backoff to avoid hammering the receiver.
	var err error
	defer func() {
		s.lock.Lock()
		defer s.lock.Unlock()
		s.flushingNow = false
		if err == nil || err == ErrNoTaskNumber {
			// Reset the retry delay.
			s.flushRetry = flushInitialRetry
			s.nextFlush = now.Add(time.Duration(settings.FlushIntervalSec) * time.Second)
			if err == nil {
				s.lastFlush = now
			}
		} else {
			// The flush has failed, back off the next flush.
			if s.flushRetry < flushMaxRetry {
				s.flushRetry *= 2
			}
			s.nextFlush = now.Add(s.flushRetry)
		}
		nextFlush = s.nextFlush
	}()

	err = s.ensureTaskNumAndFlush(ctx, state, settings)
	if err != nil {
		if err == ErrNoTaskNumber {
			logging.Warningf(ctx, "Skipping the tsmon flush: no task number assigned yet")
		} else {
			logging.WithError(err).Errorf(ctx, "Failed to flush tsmon metrics (tried to act as %q)", settings.ProdXAccount)
		}
		if sinceLastFlush := now.Sub(lastFlush); sinceLastFlush > noFlushErrorThreshold {
			logging.Errorf(ctx, "No successful tsmon flush for %s", sinceLastFlush)
			if s.TaskNumAllocator != nil {
				logging.Errorf(ctx, "Is /internal/cron/ts_mon/housekeeping running?")
			}
		}
	}

	// The 'nextFlush' return value is constructed in the defer.
	return
}

// ensureTaskNumAndFlush gets a task number assigned to the process and flushes
// the metrics.
//
// Returns ErrNoTaskNumber if the task hasn't been assigned a task number yet.
func (s *State) ensureTaskNumAndFlush(ctx context.Context, state *tsmon.State, settings *Settings) error {
	task := *state.Store().DefaultTarget().(*target.Task)
	if s.TaskNumAllocator != nil {
		// Notify the task number allocator that we are still alive and grab
		// the TaskNum assigned to us.
		switch num, err := s.TaskNumAllocator.NotifyTaskIsAlive(ctx, &task, s.instanceID); {
		case err == nil:
			task.TaskNum = int32(num)
		case err == ErrNoTaskNumber:
			if task.TaskNum >= 0 {
				logging.Warningf(ctx, "The task was inactive for too long and lost its task number, clearing cumulative metrics")
				state.ResetCumulativeMetrics(ctx)

				// Set the task num to -1 to stop resetting cumulative metrics,
				// in case ErrNoTaskNumber is repeated.
				task.TaskNum = -1
				state.Store().SetDefaultTarget(&task)
			}
			return err
		default:
			return errors.Annotate(err, "failed to get a new task num for %q", s.instanceID).Err()
		}
	} else {
		task.TaskNum = 0
	}
	state.Store().SetDefaultTarget(&task)
	return state.ParallelFlush(ctx, state.Monitor(), 4)
}
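// Note on testing (an illustrative sketch, not part of the package API): unit
// tests that exercise handlers through Middleware can keep metrics in memory
// and avoid any ProdX traffic by presetting Settings and CustomMonitor; the
// values shown below are placeholders:
//
//	testState := State{
//		Target:        func(ctx context.Context) target.Task { return target.Task{} },
//		Settings:      &Settings{Enabled: true},
//		CustomMonitor: monitor.NewNilMonitor(),
//	}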