// Source: go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/model/botinfo.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package model

import (
	"context"
	"strings"
	"time"

	"google.golang.org/protobuf/types/known/structpb"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/auth/identity"
	"go.chromium.org/luci/gae/service/datastore"

	apipb "go.chromium.org/luci/swarming/proto/api_v2"
)

// BotEventType identifies various known bot events.
//
// It is the type of the BotEvent.EventType field, which is stored in the
// unindexed `event_type` datastore property and converted to a plain string
// when a BotEvent is converted to its proto form.
type BotEventType string

// Bot events that happen outside the scope of a task.
const (
	BotEventConnected BotEventType = "bot_connected"
	BotEventError     BotEventType = "bot_error"
	BotEventIdle      BotEventType = "bot_idle"
	BotEventLog       BotEventType = "bot_log"
	BotEventMissing   BotEventType = "bot_missing"
	BotEventPolling   BotEventType = "bot_polling"
	BotEventRebooting BotEventType = "bot_rebooting"
	BotEventShutdown  BotEventType = "bot_shutdown"
	BotEventTerminate BotEventType = "bot_terminate"
)

// Bot events representing polling outcomes.
const (
	BotEventRestart BotEventType = "request_restart"
	BotEventSleep   BotEventType = "request_sleep"
	BotEventTask    BotEventType = "request_task"
	BotEventUpdate  BotEventType = "request_update"
)

// Bot events related to running tasks.
const (
	BotEventTaskCompleted BotEventType = "task_completed"
	BotEventTaskError     BotEventType = "task_error"
	BotEventTaskKilled    BotEventType = "task_killed"
	BotEventTaskUpdate    BotEventType = "task_update"
)

// BotStateEnum is used to represent state of the bot in datastore.
//
// See comment for BotInfo.Composite. Individual values should not leak in any
// public APIs, it is an implementation detail.
type BotStateEnum int64

// Possible categories of bot state.
//
// Every value is a distinct single bit. The values used by BotInfo.Composite
// come in complementary pairs (busy/idle, quarantined/healthy, dead/alive,
// in maintenance/not in maintenance), which is what allows FilterBotsByState
// to express both "yes" and "no" filters as equality filters on the same
// indexed property.
const (
	BotStateBusy             BotStateEnum = 1 << 0
	BotStateIdle             BotStateEnum = 1 << 1
	BotStateQuarantined      BotStateEnum = 1 << 2
	BotStateHealthy          BotStateEnum = 1 << 3
	BotStateUnused1          BotStateEnum = 1 << 4
	BotStateUnused2          BotStateEnum = 1 << 5
	BotStateDead             BotStateEnum = 1 << 6
	BotStateAlive            BotStateEnum = 1 << 7
	BotStateInMaintenance    BotStateEnum = 1 << 8
	BotStateNotInMaintenance BotStateEnum = 1 << 9
)

// StateFilter represents a filter over the possible bot states.
//
// Each field is a filter on one aspect of the bot state with possible values
// being TRUE (meaning "yes"), FALSE (meaning "no") and NULL (meaning "don't
// care").
//
// It is applied to a BotInfo query by FilterBotsByState.
type StateFilter struct {
	// Quarantined filters bots based on whether they are quarantined.
	Quarantined apipb.NullableBool
	// InMaintenance filters bots based on whether they are in maintenance mode.
	InMaintenance apipb.NullableBool
	// IsDead filters bots based on whether they are connected or not.
	IsDead apipb.NullableBool
	// IsBusy filters bots based on whether they execute any task or not.
	IsBusy apipb.NullableBool
}

// BotRoot is an entity group root of entities representing a single bot.
//
// Presence of this entity indicates there are BotEvent entities for this bot.
//
// TODO(vadimsh): This entity is unnecessary complication.
// Old entities cleanup
// should happen via Cloud Datastore TTL feature, then this entity is not
// needed.
type BotRoot struct {
	// Extra are entity properties that didn't match any declared ones below.
	//
	// Should normally be empty.
	Extra datastore.PropertyMap `gae:"-,extra"`

	// Key is derived based on the bot ID, see BotRootKey.
	Key *datastore.Key `gae:"$key"`

	// LegacyCurrent is no longer used.
	LegacyCurrent LegacyProperty `gae:"current"`
}

// BotRootKey is a root key of an entity group with info about a bot.
//
// The key has kind "BotRoot" and uses the bot ID as its string ID.
func BotRootKey(ctx context.Context, botID string) *datastore.Key {
	return datastore.NewKey(ctx, "BotRoot", botID, 0, nil)
}

// BotCommon contains properties that are common to both BotInfo and BotEvent.
//
// It is not meant to be stored in the datastore on its own, only as an embedded
// struct inside BotInfo or BotEvent.
type BotCommon struct {
	// State is a free form JSON dict with the bot state as reported by the bot.
	//
	// Swarming itself mostly ignores this information, but it is exposed via API
	// and UI, allowing bots to report extended information about themselves to
	// Swarming clients.
	State []byte `gae:"state,noindex"`

	// ExternalIP is the bot's IP address as seen by the server.
	ExternalIP string `gae:"external_ip,noindex"`

	// AuthenticatedAs is the bot's credentials as seen by the server.
	AuthenticatedAs identity.Identity `gae:"authenticated_as,noindex"`

	// Version of the bot code the bot is running.
	Version string `gae:"version,noindex"`

	// Quarantined means the bot is unhealthy and should not receive tasks.
	//
	// It is set when either:
	// - dimensions['quarantined'] or state['quarantined'] is set by the bot.
	// - API requests from the bot appear to be malformed.
	Quarantined bool `gae:"quarantined,noindex"`

	// Maintenance message if the bot is in maintenance.
	//
	// Maintenance state, just like quarantined state, means the bot should not
	// receive tasks. The difference is that maintenance is an expected condition:
	// - The bot moves into maintenance state in expected moments.
	// - It is expected to be short and end automatically.
	Maintenance string `gae:"maintenance_msg,noindex"`

	// TaskID is the packed TaskRunResult key of the relevant task, if any.
	//
	// For BotInfo, it identifies the current TaskRunResult being executed by
	// the bot.
	//
	// For BotEvent, it is relevant for event types `request_task`, `task_killed`,
	// `task_completed`, `task_error`.
	//
	// Note that it is **not** a packed TaskResultSummary. This `task_id` ends in
	// `1` instead of `0`.
	//
	// TODO(vadimsh): This is unfortunate, since this field ends up in BQ exports
	// where it causes confusion: task IDs in other BQ exports are "packed
	// TaskResultSummary ID", i.e. end in 0. This complicates joining BQ tables.
	TaskID string `gae:"task_id,noindex"`

	// LastSeen is the last time the bot contacted the server, if ever.
	//
	// Note that it is unindexed to avoid hotspotting the datastore, see
	// https://chromium.googlesource.com/infra/luci/luci-py/+/4e9aecba
	LastSeen datastore.Optional[time.Time, datastore.Unindexed] `gae:"last_seen_ts"`

	// IdleSince is when the bot became idle last time, if ever.
	//
	// It is unset when running the task or hooks.
	IdleSince datastore.Optional[time.Time, datastore.Unindexed] `gae:"idle_since_ts"`

	// LegacyLeaseID is no longer used.
	LegacyLeaseID LegacyProperty `gae:"lease_id"`

	// LegacyLeaseExpiration is no longer used.
	LegacyLeaseExpiration LegacyProperty `gae:"lease_expiration_ts"`

	// LegacyLeasedIndefinitely is no longer used.
	LegacyLeasedIndefinitely LegacyProperty `gae:"leased_indefinitely"`

	// LegacyMachineType is no longer used.
	LegacyMachineType LegacyProperty `gae:"machine_type"`

	// LegacyMachineLease is no longer used.
	LegacyMachineLease LegacyProperty `gae:"machine_lease"`

	// LegacyStateJSON is no longer used.
	LegacyStateJSON LegacyProperty `gae:"state_json"`

	// LegacyDimensions is no longer used.
	LegacyDimensions LegacyProperty `gae:"dimensions"`

	// LegacyIsBusy is no longer used.
	LegacyIsBusy LegacyProperty `gae:"is_busy"`
}

// BotInfo contains the latest information about a bot.
type BotInfo struct {
	BotCommon

	// Extra are entity properties that didn't match any declared ones below.
	//
	// Should normally be empty.
	Extra datastore.PropertyMap `gae:"-,extra"`

	// Key is derived based on the bot ID, see BotInfoKey.
	Key *datastore.Key `gae:"$key"`

	// Dimensions is a list of dimensions reported by the bot.
	//
	// Dimensions are used for task selection. They are encoded as a sorted list
	// of `key:value` strings. Keep in mind that the same key can be used
	// multiple times.
	//
	// The index is used to filter bots by their dimensions in bot listing API.
	Dimensions []string `gae:"dimensions_flat"`

	// Composite encodes the current state of the bot.
	//
	// For datastore performance reasons it encodes multiple aspects of the state
	// in a single indexed multi-valued field, resulting in a somewhat weird
	// semantics.
	//
	// The slice always has 4 items, with following meaning:
	//
	// Composite[0] is one of:
	//    BotStateInMaintenance    = 1 << 8  # 256
	//    BotStateNotInMaintenance = 1 << 9  # 512
	// Composite[1] is one of:
	//    BotStateDead  = 1 << 6  # 64
	//    BotStateAlive = 1 << 7  # 128
	// Composite[2] is one of:
	//    BotStateQuarantined = 1 << 2  # 4
	//    BotStateHealthy     = 1 << 3  # 8
	// Composite[3] is one of:
	//    BotStateBusy = 1 << 0  # 1
	//    BotStateIdle = 1 << 1  # 2
	Composite []BotStateEnum `gae:"composite"`

	// FirstSeen is when the bot was seen for the first time.
	FirstSeen time.Time `gae:"first_seen_ts,noindex"`

	// TaskName matches TaskRequest.Name of the task the bot executes now.
	//
	// In other words it's the title of the task identified by BotCommon.TaskID.
	// Empty if the bot is not executing any tasks now.
	TaskName string `gae:"task_name,noindex"`
}

// BotInfoKey builds a BotInfo key given the bot ID.
//
// The key has kind "BotInfo", the fixed string ID "info" and the bot's
// BotRootKey as its parent.
func BotInfoKey(ctx context.Context, botID string) *datastore.Key {
	return datastore.NewKey(ctx, "BotInfo", "info", 0, BotRootKey(ctx, botID))
}

// BotID extracts the bot ID from the entity key.
//
// The bot ID is the string ID of the parent of b.Key. This assumes b.Key is
// populated and has the shape produced by BotInfoKey.
func (b *BotInfo) BotID() string {
	return b.Key.Parent().StringID()
}

// IsDead is true if this bot is considered dead.
//
// It looks at Composite[1] and returns false if Composite has fewer than two
// items.
func (b *BotInfo) IsDead() bool {
	return len(b.Composite) > 1 && b.Composite[1] == BotStateDead
}

// IsInMaintenance is true if this bot is in maintenance.
//
// It looks at Composite[0] and returns false if Composite is empty.
func (b *BotInfo) IsInMaintenance() bool {
	return len(b.Composite) > 0 && b.Composite[0] == BotStateInMaintenance
}

// GetStatus returns the bot status.
286 func (b *BotInfo) GetStatus() string { 287 for _, v := range b.Composite { 288 switch v { 289 case BotStateInMaintenance: 290 return "maintenance" 291 case BotStateQuarantined: 292 return "quarantined" 293 case BotStateDead: 294 return "dead" 295 case BotStateBusy: 296 return "running" 297 } 298 } 299 return "ready" 300 } 301 302 // DimenionsByKey returns a list of dimension values with the given key. 303 func (b *BotInfo) DimenionsByKey(k string) (values []string) { 304 pfx := k + ":" 305 for _, kv := range b.Dimensions { 306 if val, ok := strings.CutPrefix(kv, pfx); ok { 307 values = append(values, val) 308 } 309 } 310 return values 311 } 312 313 // ToProto converts BotInfo to apipb.BotInfo. 314 func (b *BotInfo) ToProto() *apipb.BotInfo { 315 info := &apipb.BotInfo{ 316 BotId: b.BotID(), 317 TaskId: b.TaskID, 318 TaskName: b.TaskName, 319 ExternalIp: b.ExternalIP, 320 AuthenticatedAs: string(b.AuthenticatedAs), 321 IsDead: b.IsDead(), 322 Quarantined: b.Quarantined, 323 MaintenanceMsg: b.Maintenance, 324 Dimensions: dimensionsFlatToPb(b.Dimensions), 325 Version: b.Version, 326 State: string(b.State), 327 } 328 if !b.FirstSeen.IsZero() { 329 info.FirstSeenTs = timestamppb.New(b.FirstSeen) 330 } 331 if ts := b.LastSeen.Get(); !ts.IsZero() { 332 info.LastSeenTs = timestamppb.New(ts) 333 } 334 return info 335 } 336 337 // BotInfoQuery prepares a query that fetches BotInfo entities. 338 func BotInfoQuery() *datastore.Query { 339 return datastore.NewQuery("BotInfo") 340 } 341 342 // FilterBotsByDimensions limits a BotInfo query to return bots matching these 343 // dimensions. 344 // 345 // For complex filters this may split the query into multiple queries that need 346 // to run in parallel with their results merged. See SplitForQuery() in Filter 347 // for more details. 
348 func FilterBotsByDimensions(q *datastore.Query, mode SplitMode, dims Filter) []*datastore.Query { 349 return dims.Apply(q, "dimensions_flat", mode) 350 } 351 352 // FilterBotsByState limits a BotInfo query to return bots in particular state. 353 func FilterBotsByState(q *datastore.Query, state StateFilter) *datastore.Query { 354 switch state.Quarantined { 355 case apipb.NullableBool_NULL: 356 // Don't filter. 357 case apipb.NullableBool_TRUE: 358 q = q.Eq("composite", BotStateQuarantined) 359 case apipb.NullableBool_FALSE: 360 q = q.Eq("composite", BotStateHealthy) 361 } 362 363 switch state.InMaintenance { 364 case apipb.NullableBool_NULL: 365 // Don't filter. 366 case apipb.NullableBool_TRUE: 367 q = q.Eq("composite", BotStateInMaintenance) 368 case apipb.NullableBool_FALSE: 369 q = q.Eq("composite", BotStateNotInMaintenance) 370 } 371 372 switch state.IsBusy { 373 case apipb.NullableBool_NULL: 374 // Don't filter. 375 case apipb.NullableBool_TRUE: 376 q = q.Eq("composite", BotStateBusy) 377 case apipb.NullableBool_FALSE: 378 q = q.Eq("composite", BotStateIdle) 379 } 380 381 switch state.IsDead { 382 case apipb.NullableBool_NULL: 383 // Don't filter. 384 case apipb.NullableBool_TRUE: 385 q = q.Eq("composite", BotStateDead) 386 case apipb.NullableBool_FALSE: 387 q = q.Eq("composite", BotStateAlive) 388 } 389 390 return q 391 } 392 393 // BotEvent captures information about the bot during some state transition. 394 // 395 // Entities of this kind are immutable. They essentially form a log with the 396 // bot history. Entries are indexed by the timestamp to allow querying this log 397 // in the chronological order. 398 type BotEvent struct { 399 BotCommon 400 401 // Extra are entity properties that didn't match any declared ones below. 402 // 403 // Should normally be empty. 404 Extra datastore.PropertyMap `gae:"-,extra"` 405 406 // Key identifies the bot and this particular event. 407 // 408 // ID is auto-generated by the datastore. 
The bot is identified via the 409 // parent key, which can be constructed via BotRootKey(...). 410 Key *datastore.Key `gae:"$key"` 411 412 // Timestamp of when this event happened. 413 // 414 // The index is used in a bunch of places: 415 // 1. For ordering events chronologically when listing them. 416 // 2. Pagination for BQ exports. 417 // 3. Old event cleanup cron. 418 Timestamp time.Time `gae:"ts"` 419 420 // EventType describes what has happened. 421 EventType BotEventType `gae:"event_type,noindex"` 422 423 // Message is an optional free form message associated with the event. 424 Message string `gae:"message,noindex"` 425 426 // Dimensions is a list of dimensions reported by the bot. 427 // 428 // TODO(vadimsh): Stop indexing this after turning down native Swarming 429 // scheduler. This index is only used in has_capacity(...) implementation, 430 // which is a part of the native Swarming scheduler and it not used when 431 // running on top of RBE. This index is pretty big (~6 TB) and getting rid 432 // of it may also speed up the bot event insertion transaction. 433 Dimensions []string `gae:"dimensions_flat"` 434 } 435 436 // ToProto converts BotEvent to apipb.BotEventResponse. 437 func (e *BotEvent) ToProto() *apipb.BotEventResponse { 438 return &apipb.BotEventResponse{ 439 Ts: timestamppb.New(e.Timestamp), 440 EventType: string(e.EventType), 441 Message: e.Message, 442 Dimensions: dimensionsFlatToPb(e.Dimensions), 443 State: string(e.State), 444 ExternalIp: e.ExternalIP, 445 AuthenticatedAs: string(e.AuthenticatedAs), 446 Version: e.Version, 447 Quarantined: e.Quarantined, 448 MaintenanceMsg: e.Maintenance, 449 TaskId: e.TaskID, 450 } 451 } 452 453 // BotEventsQuery prepares a query that fetches BotEvent entities for a bot. 454 // 455 // Most recent events are returned first. 
456 func BotEventsQuery(ctx context.Context, botID string) *datastore.Query { 457 return datastore.NewQuery("BotEvent").Ancestor(BotRootKey(ctx, botID)).Order("-ts") 458 } 459 460 // BotDimensions is a map with bot dimensions as `key => [values]`. 461 // 462 // This type represents bot dimensions in the datastore as a JSON-encoded 463 // unindexed blob. There's an alternative "flat" indexed representation as a 464 // list of `key:value` pairs. It is used in BotCommon.Dimensions property. 465 type BotDimensions map[string][]string 466 467 // ToProperty stores the value as a JSON-blob property. 468 func (p *BotDimensions) ToProperty() (datastore.Property, error) { 469 return ToJSONProperty(p) 470 } 471 472 // FromProperty loads a JSON-blob property. 473 func (p *BotDimensions) FromProperty(prop datastore.Property) error { 474 return FromJSONProperty(prop, p) 475 } 476 477 // ToProto returns []apipb.StringListPair, sorted by keys. 478 func (p BotDimensions) ToProto() []*apipb.StringListPair { 479 return MapToStringListPair((map[string][]string)(p), true) 480 } 481 482 // ToStructPB returns a structpb.Struct. 483 func (p BotDimensions) ToStructPB() *structpb.Struct { 484 s := &structpb.Struct{ 485 Fields: make(map[string]*structpb.Value, len(p)), 486 } 487 for key, valList := range p { 488 vals := make([]*structpb.Value, 0, len(valList)) 489 for _, val := range valList { 490 vals = append(vals, &structpb.Value{Kind: &structpb.Value_StringValue{StringValue: val}}) 491 } 492 s.Fields[key] = &structpb.Value{ 493 Kind: &structpb.Value_ListValue{ 494 ListValue: &structpb.ListValue{Values: vals}, 495 }, 496 } 497 } 498 return s 499 }