// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/data/stringset"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/tsmon"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/monitor"
	"go.chromium.org/luci/common/tsmon/store"
	"go.chromium.org/luci/common/tsmon/target"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server"
	"go.chromium.org/luci/server/cron"
	"go.chromium.org/luci/server/dsmapper/dsmapperlite"
	"go.chromium.org/luci/server/gaeemulation"
	"go.chromium.org/luci/server/module"
	tsmonsrv "go.chromium.org/luci/server/tsmon"

	apipb "go.chromium.org/luci/swarming/proto/api_v2"
	"go.chromium.org/luci/swarming/server/model"
)

func main() {
	modules := []module.Module{
		cron.NewModuleFromFlags(),
		gaeemulation.NewModuleFromFlags(),
	}

	server.Main(nil, modules, func(srv *server.Server) error {
		// Build a tsmon state with a global target, to make different processes
		// report metrics into the same target. Processes need to cooperate with
		// one another to avoid conflicts. We do it by relying on GAE cron overrun
		// protection (it won't launch a cron invocation if the previous one is
		// still running).

		var mon monitor.Monitor
		// Figure out where to flush metrics.
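		// In prod with a configured ProdX account, upload via the ProdX monitor;
		// in local development, just log the values; otherwise drop them.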
		switch {
		case srv.Options.Prod && srv.Options.TsMonAccount != "":
			var err error
			mon, err = tsmonsrv.NewProdXMonitor(srv.Context, 4096, srv.Options.TsMonAccount)
			if err != nil {
				return err
			}
		case !srv.Options.Prod:
			mon = monitor.NewDebugMonitor("")
		default:
			mon = monitor.NewNilMonitor()
		}

		registerMetricsCron(srv, mon, "report-bots", reportBots)
		registerMetricsCron(srv, mon, "report-tasks", reportTasks)
		return nil
	})
}

// registerMetricsCron registers a cron handler `id` that runs `report` with a
// dedicated tsmon state flushing to the given monitor.
func registerMetricsCron(srv *server.Server, mon monitor.Monitor, id string, report func(ctx context.Context, state *tsmon.State, serviceName string) error) {
	state := tsmon.NewState()
	state.SetStore(store.NewInMemory(&target.Task{
		DataCenter:  "appengine",
		ServiceName: srv.Options.TsMonServiceName,
		JobName:     srv.Options.TsMonJobName,
		HostName:    "global",
	}))
	state.InhibitGlobalCallbacksOnFlush()
	state.SetMonitor(mon)

	cron.RegisterHandler(id, func(ctx context.Context) error {
		return report(ctx, state, srv.Options.TsMonServiceName)
	})
}

////////////////////////////////////////////////////////////////////////////////

var (
	botsPerState = metric.NewInt("swarming/rbe_migration/bots",
		"Number of Swarming bots per RBE migration state.",
		nil,
		field.String("pool"),  // e.g. "luci.infra.ci"
		field.String("state"), // e.g. "RBE", "SWARMING", "HYBRID"
	)
	botsStatus         = metric.NewString("executors/status", "Status of a job executor.", nil)
	botsDimensionsPool = metric.NewString("executors/pool", "Pool name for a given job executor.", nil)
	botsRBEInstance    = metric.NewString("executors/rbe", "RBE instance of a job executor.", nil)
	jobsActives        = metric.NewInt("jobs/active",
		"Number of running, pending or otherwise active jobs.",
		nil,
		field.String("spec_name"),     // name of a job specification.
		field.String("project_id"),    // e.g. "chromium".
		field.String("subproject_id"), // e.g. "blink". Set to empty string if not used.
		field.String("pool"),          // e.g. "Chrome".
		field.String("rbe"),           // RBE instance of the task or literal "none".
		field.String("status"),        // "pending", or "running".
	)
)

// Bot dimensions ignored when building the executors/pool metric value:
//   - android_devices is a side effect of the health of each Android device
//     connected to the bot.
//   - caches has an unbounded matrix.
//   - server_version is the current server version. It'd be good to have but
//     the current monitoring pipeline is not adapted for this.
//   - id is unique for each bot.
//   - temp_band is android specific.
//
// Keep in sync with luci/appengine/swarming/ts_mon_metrics.py.
var ignoredDimensions = stringset.NewFromSlice(
	"android_devices",
	"caches",
	"id",
	"server_version",
	"temp_band",
)

// reportBots scans all BotInfo entities and reports per-pool RBE migration
// counts as well as per-bot executor metrics.
func reportBots(ctx context.Context, state *tsmon.State, serviceName string) error {
	const shardCount = 128

	startTS := clock.Now(ctx)

	shards := make([]*shardState, shardCount)
	for i := range shards {
		shards[i] = newShardState()
	}

	mctx := tsmon.WithState(ctx, state)
	defer cleanUpBots(mctx, state)

	err := dsmapperlite.Map(ctx, model.BotInfoQuery(), shardCount, 1000,
		func(ctx context.Context, shardIdx int, bot *model.BotInfo) error {
			// These appear to be phantom GCE provider bots which are either being created
			// or weren't fully deleted. They don't have the `state` JSON dict populated,
			// and they aren't really running.
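			// Skipping them here keeps such bots out of both the per-pool counts
			// and the per-bot executor metrics below.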
			if !bot.LastSeen.IsSet() || len(bot.State) == 0 {
				return nil
			}
			shards[shardIdx].collect(ctx, bot)
			setExecutorMetrics(mctx, bot, serviceName)
			return nil
		},
	)
	if err != nil {
		return errors.Annotate(err, "when visiting BotInfo").Err()
	}

	// Merge all shards into a single set of counters.
	total := newShardState()
	for _, shard := range shards {
		total.mergeFrom(shard)
	}
	logging.Infof(ctx, "Scan done in %s. Total visited bots: %d", clock.Since(ctx, startTS), total.total)

	// Flush them to tsmon. Do not retain in memory after that.
	flushTS := clock.Now(ctx)
	for key, val := range total.counts {
		botsPerState.Set(mctx, val, key.pool, key.state)
	}

	// Note: use `ctx` here (not `mctx`) to report monitor's gRPC stats into
	// the regular process-global tsmon state.
	if err := state.ParallelFlush(ctx, nil, 32); err != nil {
		return errors.Annotate(err, "failed to flush values to monitoring").Err()
	}
	logging.Infof(ctx, "Flushed to monitoring in %s.", clock.Since(ctx, flushTS))
	return nil
}

type counterKey struct {
	pool  string // e.g. "luci.infra.ci"
	state string // e.g. "SWARMING"
}

// shardState accumulates per-(pool, state) bot counts within one scan shard.
type shardState struct {
	counts map[counterKey]int64
	total  int64
}

func newShardState() *shardState {
	return &shardState{
		counts: map[counterKey]int64{},
	}
}

func (s *shardState) collect(ctx context.Context, bot *model.BotInfo) {
	migrationState := "UNKNOWN"

	if bot.Quarantined {
		migrationState = "QUARANTINED"
	} else if bot.IsInMaintenance() {
		migrationState = "MAINTENANCE"
	} else {
		var botState struct {
			Handshaking   bool   `json:"handshaking,omitempty"`
			RBEInstance   string `json:"rbe_instance,omitempty"`
			RBEHybridMode bool   `json:"rbe_hybrid_mode,omitempty"`
		}
		if err := json.Unmarshal(bot.State, &botState); err == nil {
			switch {
			case botState.Handshaking:
				// This is not a fully connected bot.
				return
			case botState.RBEInstance == "":
				migrationState = "SWARMING"
			case botState.RBEHybridMode:
				migrationState = "HYBRID"
			case !botState.RBEHybridMode:
				migrationState = "RBE"
			}
		} else {
			logging.Warningf(ctx, "Bot %s: bad state:\n%s", bot.BotID(), bot.State)
		}
	}

	if bot.IsDead() {
		migrationState = "DEAD_" + migrationState
	}

	pools := bot.DimenionsByKey("pool")
	if len(pools) == 0 {
		pools = []string{"unknown"}
	}
	for _, pool := range pools {
		s.counts[counterKey{pool, migrationState}] += 1
	}
	s.total += 1
}

func (s *shardState) mergeFrom(another *shardState) {
	for key, count := range another.counts {
		s.counts[key] += count
	}
	s.total += another.total
}

// setExecutorMetrics sets the per-bot executors/* metrics.
func setExecutorMetrics(mctx context.Context, bot *model.BotInfo, serviceName string) {
	// HostName needs to be set per bot. Cannot use the global target.
	tctx := target.Set(mctx, &target.Task{
		DataCenter:  "appengine",
		ServiceName: serviceName,
		HostName:    fmt.Sprintf("autogen:%s", bot.BotID()),
	})
	// Status.
	status := bot.GetStatus()
	botsStatus.Set(tctx, status)
	// DimensionsPool.
	dims := poolFromDimensions(bot.Dimensions)
	botsDimensionsPool.Set(tctx, dims)
	// RBEInstance.
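	// The RBE instance is read from the bot's `state` JSON; bots without one
	// report the literal "none".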
	rbeState := "none"
	var botState struct {
		RBEInstance string `json:"rbe_instance,omitempty"`
	}
	if err := json.Unmarshal(bot.State, &botState); err == nil {
		if botState.RBEInstance != "" {
			rbeState = botState.RBEInstance
		}
	} else {
		logging.Warningf(mctx, "Bot %s: bad state:\n%s", bot.BotID(), bot.State)
	}
	botsRBEInstance.Set(tctx, rbeState)
}

// poolFromDimensions serializes the bot's dimensions and trims out redundant prefixes,
// e.g. ["cpu:x86-64", "cpu:x86-64-Broadwell_GCE"] returns "cpu:x86-64-Broadwell_GCE".
func poolFromDimensions(dimensions []string) string {
	// Assuming dimensions are sorted.
	var pairs []string

	for current := 0; current < len(dimensions); current++ {
		key := strings.SplitN(dimensions[current], ":", 2)[0]
		if ignoredDimensions.Has(key) {
			continue
		}
		next := current + 1
		// Set `current` to the longest (and last) prefix of the chain.
		// E.g. if the chain is ["os:Ubuntu", "os:Ubuntu-22", "os:Ubuntu-22.04"],
		// dimensions[current] is "os:Ubuntu-22.04".
		for next < len(dimensions) && strings.HasPrefix(dimensions[next], dimensions[current]) {
			current++
			next++
		}
		pairs = append(pairs, dimensions[current])
	}
	return strings.Join(pairs, "|")
}

func cleanUpBots(mctx context.Context, state *tsmon.State) {
	state.Store().Reset(mctx, botsPerState)
	state.Store().Reset(mctx, botsStatus)
	state.Store().Reset(mctx, botsDimensionsPool)
	state.Store().Reset(mctx, botsRBEInstance)
}

func cleanUpTasks(mctx context.Context, state *tsmon.State) {
	state.Store().Reset(mctx, jobsActives)
}

type taskCounterKey struct {
	specName     string // name of a job specification.
	projectID    string // e.g. "chromium".
	subprojectID string // e.g. "blink". Set to empty string if not used.
	pool         string // e.g. "Chrome".
	rbe          string // RBE instance of the task or literal "none".
	status       string // "pending", or "running".
}

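// taskResult accumulates per-key counts of active tasks seen during a single
// scan over TaskResultSummary entities.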
type taskResult struct {
	counts map[taskCounterKey]int64
	total  int64
}

func newTaskResult() *taskResult {
	return &taskResult{
		counts: map[taskCounterKey]int64{},
	}
}

// tagListToMap converts a list of "key:value" tags into a map. If a key is
// repeated, the last value wins.
func tagListToMap(tags []string) (tagsMap map[string]string) {
	tagsMap = make(map[string]string, len(tags))
	for _, tag := range tags {
		key, val, _ := strings.Cut(tag, ":")
		tagsMap[key] = val
	}
	return tagsMap
}

// getSpecName derives the job specification name from the task's tags.
func getSpecName(tagsMap map[string]string) string {
	if s := tagsMap["spec_name"]; s != "" {
		return s
	}
	b := tagsMap["buildername"]
	if e := tagsMap["build_is_experimental"]; e == "true" {
		b += ":experimental"
	}
	if b == "" {
		if t := tagsMap["terminate"]; t == "1" {
			return "swarming:terminate"
		}
	}
	return b
}

func getTaskResultSummaryStatus(tsr *model.TaskResultSummary) (status string) {
	switch tsr.TaskResultCommon.State {
	case apipb.TaskState_RUNNING:
		status = "running"
	case apipb.TaskState_PENDING:
		status = "pending"
	default:
		status = ""
	}
	return status
}

func (s *taskResult) collect(ctx context.Context, tsr *model.TaskResultSummary) {
	tagsMap := tagListToMap(tsr.Tags)
	key := taskCounterKey{
		specName:     getSpecName(tagsMap),
		projectID:    tagsMap["project"],
		subprojectID: tagsMap["subproject"],
		pool:         tagsMap["pool"],
		rbe:          tagsMap["rbe"],
		status:       getTaskResultSummaryStatus(tsr),
	}
	if key.rbe == "" {
		key.rbe = "none"
	}
	s.counts[key] += 1
	s.total += 1
}

func reportTasks(ctx context.Context, state *tsmon.State, serviceName string) error {
	startTS := clock.Now(ctx)

	total := newTaskResult()
	mctx := tsmon.WithState(ctx, state)
	defer cleanUpTasks(mctx, state)

	// Visit only running and pending tasks: RUNNING < PENDING in the TaskState
	// enum with no other states in between, so this range covers exactly them.
	q := model.TaskResultSummaryQuery().Lte("state", apipb.TaskState_PENDING).Gte("state", apipb.TaskState_RUNNING)
	err := datastore.RunBatch(ctx, 1000, q,
		func(trs *model.TaskResultSummary) error {
			total.collect(ctx, trs)
			return nil
		},
	)
	if err != nil {
		return errors.Annotate(err, "when visiting TaskResultSummary").Err()
	}

	logging.Infof(ctx, "Scan done in %s. Total visited Tasks: %d. Number of types of tasks: %d", clock.Since(ctx, startTS), total.total, len(total.counts))

	// Flush them to tsmon. Do not retain in memory after that.
	flushTS := clock.Now(ctx)
	for key, val := range total.counts {
		jobsActives.Set(mctx, val, key.specName, key.projectID, key.subprojectID, key.pool, key.rbe, key.status)
	}

	// Note: use `ctx` here (not `mctx`) to report monitor's gRPC stats into
	// the regular process-global tsmon state.
	if err := state.ParallelFlush(ctx, nil, 32); err != nil {
		return errors.Annotate(err, "failed to flush values to monitoring").Err()
	}
	logging.Infof(ctx, "Flushed to monitoring in %s.", clock.Since(ctx, flushTS))
	return nil
}