go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tsmon/middleware.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tsmon/middleware.go (about)

     1  // Copyright 2017 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tsmon
    16  
    17  import (
    18  	"context"
    19  	"sync"
    20  	"time"
    21  
    22  	"go.chromium.org/luci/common/clock"
    23  	"go.chromium.org/luci/common/errors"
    24  	"go.chromium.org/luci/common/iotools"
    25  	"go.chromium.org/luci/common/logging"
    26  	"go.chromium.org/luci/common/tsmon"
    27  	"go.chromium.org/luci/common/tsmon/metric"
    28  	"go.chromium.org/luci/common/tsmon/monitor"
    29  	"go.chromium.org/luci/common/tsmon/runtimestats"
    30  	"go.chromium.org/luci/common/tsmon/store"
    31  	"go.chromium.org/luci/common/tsmon/target"
    32  	"go.chromium.org/luci/common/tsmon/versions"
    33  
    34  	"go.chromium.org/luci/server/router"
    35  )
    36  
    37  // State holds the state and configuration of the tsmon library.
    38  //
    39  // Define it as a global variable and inject it in the request contexts using
    40  // State.Middleware().
    41  //
    42  // It will initialize itself from the tsmon state in the passed context on
    43  // a first use, mutating it along the way. Assumes caller is consistently using
    44  // contexts configured with exact same tsmon state (in a vast majority of cases
    45  // it would be global tsmon state that corresponds to context.Background, but
    46  // unit tests may provide its own state).
    47  //
    48  // Will panic if it detects that caller has changed tsmon state in the context
    49  // between the requests.
    50  type State struct {
    51  	// Target is lazily called to initialize default metrics target.
    52  	//
    53  	// The target identifies the collection of homogeneous processes that together
    54  	// implement the service. Each individual process in the collection is
    55  	// additionally identified by a task number, later (optionally) dynamically
    56  	// assigned via TaskNumAllocator based on unique InstanceID.
    57  	Target func(ctx context.Context) target.Task
    58  
    59  	// InstanceID returns a unique (within the scope of the service) identifier of
    60  	// this particular process.
    61  	//
    62  	// It will be used to assign a free task number via TaskNumAllocator.
    63  	//
    64  	// If nil, instance ID will be set to "". Useful when TaskNumAllocator is nil
    65  	// too (then instance ID is not important).
    66  	InstanceID func(ctx context.Context) string
    67  
    68  	// TaskNumAllocator knows how to dynamically map instance ID to a task number.
    69  	//
    70  	// If nil, 0 will be used as task number regardless of instance ID.
    71  	TaskNumAllocator TaskNumAllocator
    72  
    73  	// CustomMonitor, if not nil, is used to flush accumulated metrics instead of
    74  	// the default monitor (which is constructed on the fly based on current
    75  	// settings).
    76  	//
    77  	// Useful to override the default monitor in tests or when running locally in
    78  	// the debug mode.
    79  	CustomMonitor monitor.Monitor
    80  
    81  	// FlushInMiddleware is true to make Middleware(...) periodically
    82  	// synchronously send metrics to the backend after handling a request.
    83  	//
    84  	// This is useful on Standard GAE that doesn't support asynchronous flushes
    85  	// outside of a context of some request.
    86  	//
    87  	// If false, the user of the library *must* launch FlushPeriodically() in
    88  	// a background goroutine. Otherwise metrics won't be sent.
    89  	FlushInMiddleware bool
    90  
    91  	// Settings, if non nil, are static preset settings to use.
    92  	//
    93  	// If nil, settings will be loaded dynamically through 'settings' module.
    94  	Settings *Settings
    95  
    96  	lock sync.RWMutex
    97  
    98  	state        *tsmon.State
    99  	lastSettings Settings
   100  
   101  	instanceID  string        // cached result of InstanceID() call
   102  	flushingNow bool          // true if some goroutine is flushing right now
   103  	nextFlush   time.Time     // next time we should do the flush
   104  	lastFlush   time.Time     // last successful flush
   105  	flushRetry  time.Duration // flush retry delay
   106  }
   107  
   108  const (
   109  	// noFlushErrorThreshold defines when we start to complain in error log that
   110  	// the last successful flush (if ever) was too long ago.
   111  	noFlushErrorThreshold = 5 * time.Minute
   112  
   113  	// flushMaxRetry defines the maximum delay between flush retries.
   114  	flushMaxRetry = 10 * time.Minute
   115  
   116  	// flushInitialRetry defines the initial retry delay. This
   117  	// is doubled every retry, up to flushMaxRetry.
   118  	flushInitialRetry = 5 * time.Second
   119  )
   120  
   121  // Activate updates the tsmon state in the context to use this config.
   122  //
   123  // Not needed if metrics are updated only from inside the middleware.
   124  func (s *State) Activate(ctx context.Context) {
   125  	s.checkSettings(ctx)
   126  }
   127  
   128  // Middleware is a middleware that collects request metrics and triggers metric
   129  // flushes.
   130  func (s *State) Middleware(c *router.Context, next router.Handler) {
   131  	state, settings := s.checkSettings(c.Request.Context())
   132  	if settings.Enabled {
   133  		started := clock.Now(c.Request.Context())
   134  		req := c.Request
   135  		userAgent, ok := req.Header["User-Agent"]
   136  		if !ok || len(userAgent) == 0 {
   137  			userAgent = []string{"Unknown"}
   138  		}
   139  		ctx := c.Request.Context()
   140  		contentLength := c.Request.ContentLength
   141  		nrw := iotools.NewResponseWriter(c.Writer)
   142  		c.Writer = nrw
   143  		defer func() {
   144  			dur := clock.Now(ctx).Sub(started)
   145  			metric.UpdateServerMetrics(ctx, c.HandlerPath, nrw.Status(), dur,
   146  				contentLength, nrw.ResponseSize(), userAgent[0])
   147  		}()
   148  		next(c)
   149  		if s.FlushInMiddleware {
   150  			s.flushIfNeededImpl(ctx, state, settings)
   151  		}
   152  	} else {
   153  		next(c)
   154  	}
   155  }
   156  
   157  // FlushPeriodically runs a loop that periodically flushes metrics.
   158  //
   159  // Blocks until the context is canceled. Handles (and logs) errors internally.
   160  func (s *State) FlushPeriodically(ctx context.Context) {
   161  	for {
   162  		var nextFlush time.Time
   163  		state, settings := s.checkSettings(ctx)
   164  		if settings.Enabled {
   165  			nextFlush = s.flushIfNeededImpl(ctx, state, settings)
   166  		}
   167  		// Don't sleep less than 1 sec to avoid busy looping. It is always OK to
   168  		// flush a sec later. In most cases we'll be sleeping ~= FlushIntervalSec.
   169  		sleep := time.Second
   170  		if dt := nextFlush.Sub(clock.Now(ctx)); dt > sleep {
   171  			sleep = dt
   172  		}
   173  		if r := <-clock.After(ctx, sleep); r.Err != nil {
   174  			return // the context is canceled
   175  		}
   176  	}
   177  }
   178  
   179  // checkSettings fetches tsmon settings and initializes, reinitializes or
   180  // deinitializes tsmon, as needed.
   181  //
   182  // Returns current tsmon state and settings. Panics if the context is using
   183  // unexpected tsmon state.
   184  func (s *State) checkSettings(ctx context.Context) (*tsmon.State, *Settings) {
   185  	state := tsmon.GetState(ctx)
   186  
   187  	var settings Settings
   188  	if s.Settings != nil {
   189  		settings = *s.Settings
   190  	} else {
   191  		settings = fetchCachedSettings(ctx)
   192  	}
   193  
   194  	// Read the values used when handling previous request. In most cases they
   195  	// are identical to current ones and we can skip grabbing a heavier write
   196  	// lock.
   197  	s.lock.RLock()
   198  	if s.state == state && s.lastSettings == settings {
   199  		s.lock.RUnlock()
   200  		return state, &settings
   201  	}
   202  	s.lock.RUnlock()
   203  
   204  	// 'settings' or 'state' has changed. Reinitialize tsmon as needed under
   205  	// the write lock.
   206  	s.lock.Lock()
   207  	defer s.lock.Unlock()
   208  
   209  	// First call to 'checkSettings' ever?
   210  	if s.state == nil {
   211  		s.state = state
   212  		s.state.SetMonitor(monitor.NewNilMonitor())
   213  		s.state.InhibitGlobalCallbacksOnFlush()
   214  		if s.InstanceID != nil {
   215  			s.instanceID = s.InstanceID(ctx)
   216  		}
   217  	} else if state != s.state {
   218  		panic("tsmon state in the context was unexpectedly changed between requests")
   219  	}
   220  
   221  	switch {
   222  	case !bool(s.lastSettings.Enabled) && bool(settings.Enabled):
   223  		s.enableTSMon(ctx, settings.ProdXAccount)
   224  	case bool(s.lastSettings.Enabled) && !bool(settings.Enabled):
   225  		s.disableTSMon(ctx)
   226  	}
   227  	s.lastSettings = settings
   228  
   229  	return state, &settings
   230  }
   231  
   232  // enableTSMon puts in-memory metrics store in the context's tsmon state.
   233  //
   234  // Called with 's.lock' locked.
   235  func (s *State) enableTSMon(ctx context.Context, prodXAccount string) {
   236  	t := s.Target(ctx)
   237  	t.TaskNum = -1 // will be assigned later via TaskNumAllocator
   238  
   239  	s.state.SetStore(store.NewInMemory(&t))
   240  
   241  	// Request the flush to be executed ASAP, so it registers a claim for a task
   242  	// number via NotifyTaskIsAlive. Also reset 'lastFlush', so that we don't get
   243  	// invalid logging that the last flush was long time ago.
   244  	s.nextFlush = clock.Now(ctx)
   245  	s.lastFlush = s.nextFlush
   246  	// Set initial value for retry delay.
   247  	s.flushRetry = flushInitialRetry
   248  
   249  	mon := s.CustomMonitor
   250  	if mon == nil {
   251  		prodx, err := NewProdXMonitor(ctx, 500, prodXAccount)
   252  		if err != nil {
   253  			logging.WithError(err).Errorf(ctx, "failed to dial ProdX; using NilMonitor()")
   254  			mon = monitor.NewNilMonitor()
   255  		} else {
   256  			mon = prodx
   257  		}
   258  	}
   259  	s.state.SetMonitor(mon)
   260  }
   261  
   262  // disableTSMon puts nil metrics store in the context's tsmon state.
   263  //
   264  // Called with 's.lock' locked.
   265  func (s *State) disableTSMon(ctx context.Context) {
   266  	s.state.SetStore(store.NewNilStore())
   267  	if mon := s.state.Monitor(); mon != nil {
   268  		if err := mon.Close(); err != nil {
   269  			logging.WithError(err).Errorf(ctx, "failed to close tsmon monitor; ignoring")
   270  		}
   271  	}
   272  }
   273  
   274  // flushIfNeededImpl periodically flushes the accumulated metrics.
   275  //
   276  // It skips the flush if some other goroutine is already flushing. Logs errors.
   277  //
   278  // Returns a timestamp when the next flush should occur. It may be in the past
   279  // if the flush is happening concurrently right now in another goroutine.
   280  //
   281  // TODO(vadimsh): Refactor flushIfNeededImpl + FlushPeriodically to be less
   282  // awkward. Historically, flushIfNeededImpl was used only from Middleware and
   283  // FlushPeriodically was slapped on top later.
   284  func (s *State) flushIfNeededImpl(ctx context.Context, state *tsmon.State, settings *Settings) (nextFlush time.Time) {
   285  	// Used to compare 'nextFlush' to 'now'. Needed to make sure we really do
   286  	// the flush after sleeping in FlushPeriodically even if we happened to wake
   287  	// up a few nanoseconds earlier.
   288  	const epsilonT = 100 * time.Millisecond
   289  
   290  	now := clock.Now(ctx)
   291  
   292  	// Most of the time the flush is not needed and we can get away with
   293  	// lightweight RLock.
   294  	s.lock.RLock()
   295  	nextFlush = s.nextFlush
   296  	skip := s.flushingNow || now.Add(epsilonT).Before(nextFlush)
   297  	s.lock.RUnlock()
   298  	if skip {
   299  		return
   300  	}
   301  
   302  	// Need to flush. Update flushingNow. Redo the check under write lock, as well
   303  	// as do a bunch of other calls while we hold the lock. Will be useful later.
   304  	s.lock.Lock()
   305  	lastFlush := s.lastFlush
   306  	nextFlush = s.nextFlush
   307  	skip = s.flushingNow || now.Add(epsilonT).Before(nextFlush)
   308  	if !skip {
   309  		s.flushingNow = true
   310  	}
   311  	s.lock.Unlock()
   312  	if skip {
   313  		return
   314  	}
   315  
   316  	// The flush must be fast. Limit it by some timeout.
   317  	timeout := settings.FlushTimeoutSec
   318  	if timeout == 0 {
   319  		timeout = defaultSettings.FlushTimeoutSec
   320  	}
   321  	ctx, cancel := clock.WithTimeout(ctx, time.Duration(timeout)*time.Second)
   322  	defer cancel()
   323  
   324  	// Report per-process statistic.
   325  	versions.Report(ctx)
   326  	if settings.ReportRuntimeStats {
   327  		runtimestats.Report(ctx)
   328  	}
   329  
   330  	// Unset 'flushingNow' no matter what (even on panics).
   331  	// If flush has failed, retry with back off to avoid
   332  	// hammering the receiver.
   333  	var err error
   334  	defer func() {
   335  		s.lock.Lock()
   336  		defer s.lock.Unlock()
   337  		s.flushingNow = false
   338  		if err == nil || err == ErrNoTaskNumber {
   339  			// Reset retry delay.
   340  			s.flushRetry = flushInitialRetry
   341  			s.nextFlush = now.Add(time.Duration(settings.FlushIntervalSec) * time.Second)
   342  			if err == nil {
   343  				s.lastFlush = now
   344  			}
   345  		} else {
   346  			// Flush has failed, back off the next flush.
   347  			if s.flushRetry < flushMaxRetry {
   348  				s.flushRetry *= 2
   349  			}
   350  			s.nextFlush = now.Add(s.flushRetry)
   351  		}
   352  		nextFlush = s.nextFlush
   353  	}()
   354  
   355  	err = s.ensureTaskNumAndFlush(ctx, state, settings)
   356  	if err != nil {
   357  		if err == ErrNoTaskNumber {
   358  			logging.Warningf(ctx, "Skipping the tsmon flush: no task number assigned yet")
   359  		} else {
   360  			logging.WithError(err).Errorf(ctx, "Failed to flush tsmon metrics (tried to act as %q)", settings.ProdXAccount)
   361  		}
   362  		if sinceLastFlush := now.Sub(lastFlush); sinceLastFlush > noFlushErrorThreshold {
   363  			logging.Errorf(ctx, "No successful tsmon flush for %s", sinceLastFlush)
   364  			if s.TaskNumAllocator != nil {
   365  				logging.Errorf(ctx, "Is /internal/cron/ts_mon/housekeeping running?")
   366  			}
   367  		}
   368  	}
   369  
   370  	// 'nextFlush' return value is constructed in the defer.
   371  	return
   372  }
   373  
   374  // ensureTaskNumAndFlush gets a task number assigned to the process and flushes
   375  // the metrics.
   376  //
   377  // Returns ErrNoTaskNumber if the task wasn't assigned a task number yet.
   378  func (s *State) ensureTaskNumAndFlush(ctx context.Context, state *tsmon.State, settings *Settings) error {
   379  	task := *state.Store().DefaultTarget().(*target.Task)
   380  	if s.TaskNumAllocator != nil {
   381  		// Notify the task number allocator that we are still alive and grab
   382  		// the TaskNum assigned to us.
   383  		switch num, err := s.TaskNumAllocator.NotifyTaskIsAlive(ctx, &task, s.instanceID); {
   384  		case err == nil:
   385  			task.TaskNum = int32(num)
   386  		case err == ErrNoTaskNumber:
   387  			if task.TaskNum >= 0 {
   388  				logging.Warningf(ctx, "The task was inactive for too long and lost its task number, clearing cumulative metrics")
   389  				state.ResetCumulativeMetrics(ctx)
   390  
   391  				// set the task num to -1 to stop resetting cumulative metrics,
   392  				// in case ErrNoTaskNumber is repeated.
   393  				task.TaskNum = -1
   394  				state.Store().SetDefaultTarget(&task)
   395  			}
   396  			return err
   397  		default:
   398  			return errors.Annotate(err, "failed to get a new task num for %q", s.instanceID).Err()
   399  		}
   400  	} else {
   401  		task.TaskNum = 0
   402  	}
   403  	state.Store().SetDefaultTarget(&task)
   404  	return state.ParallelFlush(ctx, state.Monitor(), 4)
   405  }