go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/monitor/main.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/data/stringset"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/tsmon"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/monitor"
	"go.chromium.org/luci/common/tsmon/store"
	"go.chromium.org/luci/common/tsmon/target"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server"
	"go.chromium.org/luci/server/cron"
	"go.chromium.org/luci/server/dsmapper/dsmapperlite"
	"go.chromium.org/luci/server/gaeemulation"
	"go.chromium.org/luci/server/module"
	tsmonsrv "go.chromium.org/luci/server/tsmon"

	apipb "go.chromium.org/luci/swarming/proto/api_v2"
	"go.chromium.org/luci/swarming/server/model"
)

func main() {
	modules := []module.Module{
		cron.NewModuleFromFlags(),
		gaeemulation.NewModuleFromFlags(),
	}

	server.Main(nil, modules, func(srv *server.Server) error {
		// Build a tsmon state with a global target, so that different processes
		// report metrics into the same target. Processes need to cooperate with
		// one another to avoid conflicts. We do it by relying on GAE cron overrun
		// protection (it won't launch a cron invocation if the previous one is
		// still running).

		var mon monitor.Monitor
		// Figure out where to flush metrics.
		switch {
		case srv.Options.Prod && srv.Options.TsMonAccount != "":
			var err error
			mon, err = tsmonsrv.NewProdXMonitor(srv.Context, 4096, srv.Options.TsMonAccount)
			if err != nil {
				return err
			}
		case !srv.Options.Prod:
			mon = monitor.NewDebugMonitor("")
		default:
			mon = monitor.NewNilMonitor()
		}

		registerMetricsCron(srv, mon, "report-bots", reportBots)
		registerMetricsCron(srv, mon, "report-tasks", reportTasks)
		return nil
	})
}

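// registerMetricsCron registers a cron handler `id` that invokes `report` with
// a dedicated tsmon state configured to flush into the given monitor under a
// global task target.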
func registerMetricsCron(srv *server.Server, mon monitor.Monitor, id string, report func(ctx context.Context, state *tsmon.State, serviceName string) error) {
	state := tsmon.NewState()
	state.SetStore(store.NewInMemory(&target.Task{
		DataCenter:  "appengine",
		ServiceName: srv.Options.TsMonServiceName,
		JobName:     srv.Options.TsMonJobName,
		HostName:    "global",
	}))
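	// Don't run the process-global metric callbacks when flushing this state:
	// it should carry only the metrics set explicitly by the cron handlers.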
	state.InhibitGlobalCallbacksOnFlush()
	state.SetMonitor(mon)

	cron.RegisterHandler(id, func(ctx context.Context) error {
		return report(ctx, state, srv.Options.TsMonServiceName)
	})
}

////////////////////////////////////////////////////////////////////////////////

var (
	botsPerState = metric.NewInt("swarming/rbe_migration/bots",
		"Number of Swarming bots per RBE migration state.",
		nil,
		field.String("pool"),  // e.g. "luci.infra.ci"
		field.String("state"), // e.g. "RBE", "SWARMING", "HYBRID"
	)
	botsStatus         = metric.NewString("executors/status", "Status of a job executor.", nil)
	botsDimensionsPool = metric.NewString("executors/pool", "Pool name for a given job executor.", nil)
	botsRBEInstance    = metric.NewString("executors/rbe", "RBE instance of a job executor.", nil)
	jobsActives        = metric.NewInt("jobs/active",
		"Number of running, pending or otherwise active jobs.",
		nil,
		field.String("spec_name"),     // name of a job specification.
		field.String("project_id"),    // e.g. "chromium".
		field.String("subproject_id"), // e.g. "blink". Set to empty string if not used.
		field.String("pool"),          // e.g. "Chrome".
		field.String("rbe"),           // RBE instance of the task or literal "none".
		field.String("status"),        // "pending", or "running".
	)
)

// ignoredDimensions lists bot dimension keys that are excluded when
// serializing bot dimensions for the executors/pool metric:
//
//   - android_devices is a side effect of the health of each Android device
//     connected to the bot.
//   - caches has an unbounded matrix.
//   - server_version is the current server version. It'd be good to have but the
//     current monitoring pipeline is not adapted for this.
//   - id is unique for each bot.
//   - temp_band is Android-specific.
//
// Keep in sync with luci/appengine/swarming/ts_mon_metrics.py.
var ignoredDimensions = stringset.NewFromSlice(
	"android_devices",
	"caches",
	"id",
	"server_version",
	"temp_band",
)

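// reportBots scans all BotInfo entities and reports bot metrics: per-pool RBE
// migration counts and per-bot executors/* metrics.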
func reportBots(ctx context.Context, state *tsmon.State, serviceName string) error {
	const shardCount = 128

	startTS := clock.Now(ctx)

	shards := make([]*shardState, shardCount)
	for i := range shards {
		shards[i] = newShardState()
	}

	mctx := tsmon.WithState(ctx, state)
	defer cleanUpBots(mctx, state)

	err := dsmapperlite.Map(ctx, model.BotInfoQuery(), shardCount, 1000,
		func(ctx context.Context, shardIdx int, bot *model.BotInfo) error {
			// These appear to be phantom GCE provider bots that are either still being
			// created or weren't fully deleted. They don't have the `state` JSON dict
			// populated, and they aren't really running.
			if !bot.LastSeen.IsSet() || len(bot.State) == 0 {
				return nil
			}
			shards[shardIdx].collect(ctx, bot)
			setExecutorMetrics(mctx, bot, serviceName)
			return nil
		},
	)
	if err != nil {
		return errors.Annotate(err, "when visiting BotInfo").Err()
	}

	// Merge all shards into a single set of counters.
	total := newShardState()
	for _, shard := range shards {
		total.mergeFrom(shard)
	}
	logging.Infof(ctx, "Scan done in %s. Total visited bots: %d", clock.Since(ctx, startTS), total.total)

	// Flush them to tsmon. Do not retain in memory after that.
	flushTS := clock.Now(ctx)
	for key, val := range total.counts {
		botsPerState.Set(mctx, val, key.pool, key.state)
	}

	// Note: use `ctx` here (not `mctx`) to report monitor's gRPC stats into
	// the regular process-global tsmon state.
	if err := state.ParallelFlush(ctx, nil, 32); err != nil {
		return errors.Annotate(err, "failed to flush values to monitoring").Err()
	}
	logging.Infof(ctx, "Flushed to monitoring in %s.", clock.Since(ctx, flushTS))
	return nil
}

type counterKey struct {
	pool  string // e.g. "luci.infra.ci"
	state string // e.g. "SWARMING"
}

type shardState struct {
	counts map[counterKey]int64
	total  int64
}

func newShardState() *shardState {
	return &shardState{
		counts: map[counterKey]int64{},
	}
}

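// collect buckets a single bot by (pool, RBE migration state) and increments
// the corresponding counters.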
func (s *shardState) collect(ctx context.Context, bot *model.BotInfo) {
	migrationState := "UNKNOWN"

	if bot.Quarantined {
		migrationState = "QUARANTINED"
	} else if bot.IsInMaintenance() {
		migrationState = "MAINTENANCE"
	} else {
		var botState struct {
			Handshaking   bool   `json:"handshaking,omitempty"`
			RBEInstance   string `json:"rbe_instance,omitempty"`
			RBEHybridMode bool   `json:"rbe_hybrid_mode,omitempty"`
		}
		if err := json.Unmarshal(bot.State, &botState); err == nil {
			switch {
			case botState.Handshaking:
				// This is not a fully connected bot.
				return
			case botState.RBEInstance == "":
				migrationState = "SWARMING"
			case botState.RBEHybridMode:
				migrationState = "HYBRID"
			case !botState.RBEHybridMode:
				migrationState = "RBE"
			}
		} else {
			logging.Warningf(ctx, "Bot %s: bad state:\n%s", bot.BotID(), bot.State)
		}
	}

	if bot.IsDead() {
		migrationState = "DEAD_" + migrationState
	}

	pools := bot.DimenionsByKey("pool")
	if len(pools) == 0 {
		pools = []string{"unknown"}
	}
	for _, pool := range pools {
		s.counts[counterKey{pool, migrationState}] += 1
	}
	s.total += 1
}

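// mergeFrom adds counters accumulated by another shard into this one.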
func (s *shardState) mergeFrom(another *shardState) {
	for key, count := range another.counts {
		s.counts[key] += count
	}
	s.total += another.total
}

// setExecutorMetrics reports the per-bot executors/* metrics into a per-bot
// monitoring target.
func setExecutorMetrics(mctx context.Context, bot *model.BotInfo, serviceName string) {
	// HostName needs to be set per bot. Cannot use global target.
	tctx := target.Set(mctx, &target.Task{
		DataCenter:  "appengine",
		ServiceName: serviceName,
		HostName:    fmt.Sprintf("autogen:%s", bot.BotID()),
	})
	// Status.
	status := bot.GetStatus()
	botsStatus.Set(tctx, status)
	// DimensionsPool.
	dims := poolFromDimensions(bot.Dimensions)
	botsDimensionsPool.Set(tctx, dims)
	// RBEInstance.
	rbeState := "none"
	var botState struct {
		RBEInstance string `json:"rbe_instance,omitempty"`
	}
	if err := json.Unmarshal(bot.State, &botState); err == nil {
		if botState.RBEInstance != "" {
			rbeState = botState.RBEInstance
		}
	} else {
		logging.Warningf(mctx, "Bot %s: bad state:\n%s", bot.BotID(), bot.State)
	}
	botsRBEInstance.Set(tctx, rbeState)
}

// poolFromDimensions serializes the bot's dimensions and trims out redundant prefixes,
// e.g. ["cpu:x86-64", "cpu:x86-64-Broadwell_GCE"] returns "cpu:x86-64-Broadwell_GCE".
func poolFromDimensions(dimensions []string) string {
	// Assuming dimensions are sorted.
	var pairs []string

	for current := 0; current < len(dimensions); current++ {
		key := strings.SplitN(dimensions[current], ":", 2)[0]
		if ignoredDimensions.Has(key) {
			continue
		}
		next := current + 1
		// Advance `current` to the longest (and last) element of the prefix chain,
		// e.g. if the chain is ["os:Ubuntu", "os:Ubuntu-22", "os:Ubuntu-22.04"],
		// dimensions[current] ends up being "os:Ubuntu-22.04".
		for next < len(dimensions) && strings.HasPrefix(dimensions[next], dimensions[current]) {
			current++
			next++
		}
		pairs = append(pairs, dimensions[current])
	}
	return strings.Join(pairs, "|")
}

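// cleanUpBots resets all bot metrics in the given state so stale values from
// the previous run are not retained.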
func cleanUpBots(mctx context.Context, state *tsmon.State) {
	state.Store().Reset(mctx, botsPerState)
	state.Store().Reset(mctx, botsStatus)
	state.Store().Reset(mctx, botsDimensionsPool)
	state.Store().Reset(mctx, botsRBEInstance)
}

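// cleanUpTasks resets the task metrics in the given state.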
func cleanUpTasks(mctx context.Context, state *tsmon.State) {
	state.Store().Reset(mctx, jobsActives)
}

type taskCounterKey struct {
	specName     string // name of a job specification.
	projectID    string // e.g. "chromium".
	subprojectID string // e.g. "blink". Set to empty string if not used.
	pool         string // e.g. "Chrome".
	rbe          string // RBE instance of the task or literal "none".
	status       string // "pending", or "running".
}

type taskResult struct {
	counts map[taskCounterKey]int64
	total  int64
}

func newTaskResult() *taskResult {
	return &taskResult{
		counts: map[taskCounterKey]int64{},
	}
}

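// tagListToMap converts a list of "key:value" task tags into a map.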
func tagListToMap(tags []string) (tagsMap map[string]string) {
	tagsMap = make(map[string]string, len(tags))
	for _, tag := range tags {
		key, val, _ := strings.Cut(tag, ":")
		tagsMap[key] = val
	}
	return tagsMap
}

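// getSpecName derives the spec_name metric field from task tags: the explicit
// spec_name tag if present, otherwise the buildername (with an ":experimental"
// suffix for experimental builds), or "swarming:terminate" for termination
// tasks.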
func getSpecName(tagsMap map[string]string) string {
	if s := tagsMap["spec_name"]; s != "" {
		return s
	}
	b := tagsMap["buildername"]
	if e := tagsMap["build_is_experimental"]; e == "true" {
		b += ":experimental"
	}
	if b == "" {
		if t := tagsMap["terminate"]; t == "1" {
			return "swarming:terminate"
		}
	}
	return b
}

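// getTaskResultSummaryStatus maps the task state to the "status" metric field:
// "running", "pending", or "" for any other state.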
func getTaskResultSummaryStatus(tsr *model.TaskResultSummary) (status string) {
	switch tsr.TaskResultCommon.State {
	case apipb.TaskState_RUNNING:
		status = "running"
	case apipb.TaskState_PENDING:
		status = "pending"
	default:
		status = ""
	}
	return status
}

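// collect increments the counter that corresponds to the given task's spec,
// project, pool, RBE instance and status.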
func (s *taskResult) collect(ctx context.Context, tsr *model.TaskResultSummary) {
	tagsMap := tagListToMap(tsr.Tags)
	key := taskCounterKey{
		specName:     getSpecName(tagsMap),
		projectID:    tagsMap["project"],
		subprojectID: tagsMap["subproject"],
		pool:         tagsMap["pool"],
		rbe:          tagsMap["rbe"],
		status:       getTaskResultSummaryStatus(tsr),
	}
	if key.rbe == "" {
		key.rbe = "none"
	}
	s.counts[key] += 1
	s.total += 1
}

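// reportTasks scans pending and running TaskResultSummary entities and reports
// the jobs/active metric.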
func reportTasks(ctx context.Context, state *tsmon.State, serviceName string) error {
	startTS := clock.Now(ctx)

	total := newTaskResult()
	mctx := tsmon.WithState(ctx, state)
	defer cleanUpTasks(mctx, state)

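	// TaskState enum values are ordered such that RUNNING < PENDING, so this
	// range query matches exactly the running and pending tasks.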
	q := model.TaskResultSummaryQuery().Lte("state", apipb.TaskState_PENDING).Gte("state", apipb.TaskState_RUNNING)
	err := datastore.RunBatch(ctx, 1000, q,
		func(trs *model.TaskResultSummary) error {
			total.collect(ctx, trs)
			return nil
		},
	)
	if err != nil {
		return errors.Annotate(err, "when visiting TaskResultSummary").Err()
	}

	logging.Infof(ctx, "Scan done in %s. Total visited Tasks: %d. Number of types of tasks: %d", clock.Since(ctx, startTS), total.total, len(total.counts))

	// Flush them to tsmon. Do not retain in memory after that.
	flushTS := clock.Now(ctx)
	for key, val := range total.counts {
		jobsActives.Set(mctx, val, key.specName, key.projectID, key.subprojectID, key.pool, key.rbe, key.status)
	}

	// Note: use `ctx` here (not `mctx`) to report monitor's gRPC stats into
	// the regular process-global tsmon state.
	if err := state.ParallelFlush(ctx, nil, 32); err != nil {
		return errors.Annotate(err, "failed to flush values to monitoring").Err()
	}
	logging.Infof(ctx, "Flushed to monitoring in %s.", clock.Since(ctx, flushTS))
	return nil
}