go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/model/botinfo.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package model
    16  
    17  import (
    18  	"context"
    19  	"strings"
    20  	"time"
    21  
    22  	"google.golang.org/protobuf/types/known/structpb"
    23  	"google.golang.org/protobuf/types/known/timestamppb"
    24  
    25  	"go.chromium.org/luci/auth/identity"
    26  	"go.chromium.org/luci/gae/service/datastore"
    27  
    28  	apipb "go.chromium.org/luci/swarming/proto/api_v2"
    29  )
    30  
// BotEventType identifies various known bot events.
//
// Values of this type are stored in the `event_type` property of BotEvent
// entities and are exposed as plain strings by BotEvent.ToProto.
type BotEventType string

// Bot events that happen outside the scope of a task.
const (
	BotEventConnected BotEventType = "bot_connected"
	BotEventError     BotEventType = "bot_error"
	BotEventIdle      BotEventType = "bot_idle"
	BotEventLog       BotEventType = "bot_log"
	BotEventMissing   BotEventType = "bot_missing"
	BotEventPolling   BotEventType = "bot_polling"
	BotEventRebooting BotEventType = "bot_rebooting"
	BotEventShutdown  BotEventType = "bot_shutdown"
	BotEventTerminate BotEventType = "bot_terminate"
)

// Bot events representing polling outcomes.
//
// Unlike the other groups, the string values here use the `request_` prefix.
const (
	BotEventRestart BotEventType = "request_restart"
	BotEventSleep   BotEventType = "request_sleep"
	BotEventTask    BotEventType = "request_task"
	BotEventUpdate  BotEventType = "request_update"
)

// Bot events related to running tasks.
//
// The string values use the `task_` prefix.
const (
	BotEventTaskCompleted BotEventType = "task_completed"
	BotEventTaskError     BotEventType = "task_error"
	BotEventTaskKilled    BotEventType = "task_killed"
	BotEventTaskUpdate    BotEventType = "task_update"
)
    62  
// BotStateEnum is used to represent state of the bot in datastore.
//
// See comment for BotInfo.Composite. Individual values should not leak in any
// public APIs, it is an implementation detail.
type BotStateEnum int64

// Possible categories of bot state.
//
// Each value is a distinct single bit. Apart from the two "unused" slots, the
// values come in complementary pairs (busy/idle, quarantined/healthy,
// dead/alive, in maintenance/not in maintenance); FilterBotsByState picks one
// value of a pair to express a TRUE or FALSE filter.
const (
	BotStateBusy             BotStateEnum = 1 << 0
	BotStateIdle             BotStateEnum = 1 << 1
	BotStateQuarantined      BotStateEnum = 1 << 2
	BotStateHealthy          BotStateEnum = 1 << 3
	BotStateUnused1          BotStateEnum = 1 << 4 // not referenced in this file
	BotStateUnused2          BotStateEnum = 1 << 5 // not referenced in this file
	BotStateDead             BotStateEnum = 1 << 6
	BotStateAlive            BotStateEnum = 1 << 7
	BotStateInMaintenance    BotStateEnum = 1 << 8
	BotStateNotInMaintenance BotStateEnum = 1 << 9
)
    82  
// StateFilter represents a filter over the possible bot states.
//
// Each field is a filter on one aspect of the bot state with possible values
// being TRUE (meaning "yes"), FALSE (meaning "no") and NULL (meaning "don't
// care").
//
// The filter is applied to a BotInfo query by FilterBotsByState, which turns
// every non-NULL field into an equality filter on the `composite` property.
type StateFilter struct {
	// Quarantined filters bots based on whether they are quarantined.
	Quarantined apipb.NullableBool
	// InMaintenance filters bots based on whether they are in maintenance mode.
	InMaintenance apipb.NullableBool
	// IsDead filters bots based on whether they are connected or not.
	IsDead apipb.NullableBool
	// IsBusy filters bots based on whether they execute any task or not.
	IsBusy apipb.NullableBool
}
    98  
// BotRoot is an entity group root of entities representing a single bot.
//
// Presence of this entity indicates there are BotEvent entities for this bot.
//
// TODO(vadimsh): This entity is an unnecessary complication. Old entities
// cleanup should happen via Cloud Datastore TTL feature, then this entity is
// not needed.
type BotRoot struct {
	// Extra are entity properties that didn't match any declared ones below.
	//
	// Should normally be empty.
	Extra datastore.PropertyMap `gae:"-,extra"`

	// Key is derived based on the bot ID, see BotRootKey.
	//
	// It has kind "BotRoot", the bot ID as its string ID and no parent.
	Key *datastore.Key `gae:"$key"`

	// LegacyCurrent is no longer used.
	LegacyCurrent LegacyProperty `gae:"current"`
}
   118  
   119  // BotRootKey is a root key of an entity group with info about a bot.
   120  func BotRootKey(ctx context.Context, botID string) *datastore.Key {
   121  	return datastore.NewKey(ctx, "BotRoot", botID, 0, nil)
   122  }
   123  
// BotCommon contains properties that are common to both BotInfo and BotEvent.
//
// It is not meant to be stored in the datastore on its own, only as an embedded
// struct inside BotInfo or BotEvent.
type BotCommon struct {
	// State is a free form JSON dict with the bot state as reported by the bot.
	//
	// Swarming itself mostly ignores this information, but it is exposed via API
	// and UI, allowing bots to report extended information about themselves to
	// Swarming clients.
	State []byte `gae:"state,noindex"`

	// ExternalIP is the bot's IP address as seen by the server.
	ExternalIP string `gae:"external_ip,noindex"`

	// AuthenticatedAs is the bot's credentials as seen by the server.
	AuthenticatedAs identity.Identity `gae:"authenticated_as,noindex"`

	// Version of the bot code the bot is running.
	Version string `gae:"version,noindex"`

	// Quarantined means the bot is unhealthy and should not receive tasks.
	//
	// It is set when either:
	// - dimensions['quarantined'] or state['quarantined'] is set by the bot.
	// - API requests from the bot appear to be malformed.
	Quarantined bool `gae:"quarantined,noindex"`

	// Maintenance message if the bot is in maintenance.
	//
	// Maintenance state, just like quarantined state, means the bot should not
	// receive tasks. The difference is that maintenance is an expected condition:
	//   - The bot moves into maintenance state in expected moments.
	//   - It is expected to be short and end automatically.
	Maintenance string `gae:"maintenance_msg,noindex"`

	// TaskID is the packed TaskRunResult key of the relevant task, if any.
	//
	// For BotInfo, it identifies the current TaskRunResult being executed by
	// the bot.
	//
	// For BotEvent, it is relevant for event types `request_task`, `task_killed`,
	// `task_completed`, `task_error`.
	//
	// Note that it is **not** a packed TaskResultSummary. This `task_id` ends in
	// `1` instead of `0`.
	//
	// TODO(vadimsh): This is unfortunate, since this field ends up in BQ exports
	// where it causes confusion: task IDs in other BQ exports are "packed
	// TaskResultSummary ID", i.e. end in 0. This complicates joining BQ tables.
	TaskID string `gae:"task_id,noindex"`

	// LastSeen is the last time the bot contacted the server, if ever.
	//
	// Note that it is unindexed to avoid hotspotting the datastore, see
	// https://chromium.googlesource.com/infra/luci/luci-py/+/4e9aecba
	LastSeen datastore.Optional[time.Time, datastore.Unindexed] `gae:"last_seen_ts"`

	// IdleSince is when the bot became idle last time, if ever.
	//
	// It is unset when running the task or hooks.
	IdleSince datastore.Optional[time.Time, datastore.Unindexed] `gae:"idle_since_ts"`

	// LegacyLeaseID is no longer used.
	LegacyLeaseID LegacyProperty `gae:"lease_id"`

	// LegacyLeaseExpiration is no longer used.
	LegacyLeaseExpiration LegacyProperty `gae:"lease_expiration_ts"`

	// LegacyLeasedIndefinitely is no longer used.
	LegacyLeasedIndefinitely LegacyProperty `gae:"leased_indefinitely"`

	// LegacyMachineType is no longer used.
	LegacyMachineType LegacyProperty `gae:"machine_type"`

	// LegacyMachineLease is no longer used.
	LegacyMachineLease LegacyProperty `gae:"machine_lease"`

	// LegacyStateJSON is no longer used.
	LegacyStateJSON LegacyProperty `gae:"state_json"`

	// LegacyDimensions is no longer used.
	LegacyDimensions LegacyProperty `gae:"dimensions"`

	// LegacyIsBusy is no longer used.
	LegacyIsBusy LegacyProperty `gae:"is_busy"`
}
   211  
// BotInfo contains the latest information about a bot.
type BotInfo struct {
	BotCommon

	// Extra are entity properties that didn't match any declared ones below.
	//
	// Should normally be empty.
	Extra datastore.PropertyMap `gae:"-,extra"`

	// Key is derived based on the bot ID, see BotInfoKey.
	//
	// The bot ID is the string ID of the parent key, see BotID.
	Key *datastore.Key `gae:"$key"`

	// Dimensions is a list of dimensions reported by the bot.
	//
	// Dimensions are used for task selection. They are encoded as a sorted list
	// of `key:value` strings. Keep in mind that the same key can be used
	// multiple times.
	//
	// The index is used to filter bots by their dimensions in bot listing API.
	Dimensions []string `gae:"dimensions_flat"`

	// Composite encodes the current state of the bot.
	//
	// For datastore performance reasons it encodes multiple aspects of the state
	// in a single indexed multi-valued field, resulting in somewhat weird
	// semantics.
	//
	// The slice always has 4 items, with the following meaning:
	//
	// Composite[0] is one of:
	//    BotStateInMaintenance    = 1 << 8  # 256
	//    BotStateNotInMaintenance = 1 << 9  # 512
	// Composite[1] is one of:
	//    BotStateDead  = 1 << 6  # 64
	//    BotStateAlive = 1 << 7  # 128
	// Composite[2] is one of:
	//    BotStateQuarantined = 1 << 2  # 4
	//    BotStateHealthy     = 1 << 3  # 8
	// Composite[3] is one of:
	//    BotStateBusy = 1 << 0  # 1
	//    BotStateIdle = 1 << 1  # 2
	Composite []BotStateEnum `gae:"composite"`

	// FirstSeen is when the bot was seen for the first time.
	FirstSeen time.Time `gae:"first_seen_ts,noindex"`

	// TaskName matches TaskRequest.Name of the task the bot executes now.
	//
	// In other words it's the title of the task identified by BotCommon.TaskID.
	// Empty if the bot is not executing any tasks now.
	TaskName string `gae:"task_name,noindex"`
}
   264  
   265  // BotInfoKey builds a BotInfo key given the bot ID.
   266  func BotInfoKey(ctx context.Context, botID string) *datastore.Key {
   267  	return datastore.NewKey(ctx, "BotInfo", "info", 0, BotRootKey(ctx, botID))
   268  }
   269  
   270  // BotID extracts the bot ID from the entity key.
   271  func (b *BotInfo) BotID() string {
   272  	return b.Key.Parent().StringID()
   273  }
   274  
   275  // IsDead is true if this bot is considered dead.
   276  func (b *BotInfo) IsDead() bool {
   277  	return len(b.Composite) > 1 && b.Composite[1] == BotStateDead
   278  }
   279  
   280  // IsInMaintenance is true if this bot is in maintenance.
   281  func (b *BotInfo) IsInMaintenance() bool {
   282  	return len(b.Composite) > 0 && b.Composite[0] == BotStateInMaintenance
   283  }
   284  
   285  // GetStatus returns the bot status.
   286  func (b *BotInfo) GetStatus() string {
   287  	for _, v := range b.Composite {
   288  		switch v {
   289  		case BotStateInMaintenance:
   290  			return "maintenance"
   291  		case BotStateQuarantined:
   292  			return "quarantined"
   293  		case BotStateDead:
   294  			return "dead"
   295  		case BotStateBusy:
   296  			return "running"
   297  		}
   298  	}
   299  	return "ready"
   300  }
   301  
   302  // DimenionsByKey returns a list of dimension values with the given key.
   303  func (b *BotInfo) DimenionsByKey(k string) (values []string) {
   304  	pfx := k + ":"
   305  	for _, kv := range b.Dimensions {
   306  		if val, ok := strings.CutPrefix(kv, pfx); ok {
   307  			values = append(values, val)
   308  		}
   309  	}
   310  	return values
   311  }
   312  
   313  // ToProto converts BotInfo to apipb.BotInfo.
   314  func (b *BotInfo) ToProto() *apipb.BotInfo {
   315  	info := &apipb.BotInfo{
   316  		BotId:           b.BotID(),
   317  		TaskId:          b.TaskID,
   318  		TaskName:        b.TaskName,
   319  		ExternalIp:      b.ExternalIP,
   320  		AuthenticatedAs: string(b.AuthenticatedAs),
   321  		IsDead:          b.IsDead(),
   322  		Quarantined:     b.Quarantined,
   323  		MaintenanceMsg:  b.Maintenance,
   324  		Dimensions:      dimensionsFlatToPb(b.Dimensions),
   325  		Version:         b.Version,
   326  		State:           string(b.State),
   327  	}
   328  	if !b.FirstSeen.IsZero() {
   329  		info.FirstSeenTs = timestamppb.New(b.FirstSeen)
   330  	}
   331  	if ts := b.LastSeen.Get(); !ts.IsZero() {
   332  		info.LastSeenTs = timestamppb.New(ts)
   333  	}
   334  	return info
   335  }
   336  
   337  // BotInfoQuery prepares a query that fetches BotInfo entities.
   338  func BotInfoQuery() *datastore.Query {
   339  	return datastore.NewQuery("BotInfo")
   340  }
   341  
   342  // FilterBotsByDimensions limits a BotInfo query to return bots matching these
   343  // dimensions.
   344  //
   345  // For complex filters this may split the query into multiple queries that need
   346  // to run in parallel with their results merged. See SplitForQuery() in Filter
   347  // for more details.
   348  func FilterBotsByDimensions(q *datastore.Query, mode SplitMode, dims Filter) []*datastore.Query {
   349  	return dims.Apply(q, "dimensions_flat", mode)
   350  }
   351  
   352  // FilterBotsByState limits a BotInfo query to return bots in particular state.
   353  func FilterBotsByState(q *datastore.Query, state StateFilter) *datastore.Query {
   354  	switch state.Quarantined {
   355  	case apipb.NullableBool_NULL:
   356  		// Don't filter.
   357  	case apipb.NullableBool_TRUE:
   358  		q = q.Eq("composite", BotStateQuarantined)
   359  	case apipb.NullableBool_FALSE:
   360  		q = q.Eq("composite", BotStateHealthy)
   361  	}
   362  
   363  	switch state.InMaintenance {
   364  	case apipb.NullableBool_NULL:
   365  		// Don't filter.
   366  	case apipb.NullableBool_TRUE:
   367  		q = q.Eq("composite", BotStateInMaintenance)
   368  	case apipb.NullableBool_FALSE:
   369  		q = q.Eq("composite", BotStateNotInMaintenance)
   370  	}
   371  
   372  	switch state.IsBusy {
   373  	case apipb.NullableBool_NULL:
   374  		// Don't filter.
   375  	case apipb.NullableBool_TRUE:
   376  		q = q.Eq("composite", BotStateBusy)
   377  	case apipb.NullableBool_FALSE:
   378  		q = q.Eq("composite", BotStateIdle)
   379  	}
   380  
   381  	switch state.IsDead {
   382  	case apipb.NullableBool_NULL:
   383  		// Don't filter.
   384  	case apipb.NullableBool_TRUE:
   385  		q = q.Eq("composite", BotStateDead)
   386  	case apipb.NullableBool_FALSE:
   387  		q = q.Eq("composite", BotStateAlive)
   388  	}
   389  
   390  	return q
   391  }
   392  
// BotEvent captures information about the bot during some state transition.
//
// Entities of this kind are immutable. They essentially form a log with the
// bot history. Entries are indexed by the timestamp to allow querying this log
// in the chronological order.
type BotEvent struct {
	BotCommon

	// Extra are entity properties that didn't match any declared ones below.
	//
	// Should normally be empty.
	Extra datastore.PropertyMap `gae:"-,extra"`

	// Key identifies the bot and this particular event.
	//
	// ID is auto-generated by the datastore. The bot is identified via the
	// parent key, which can be constructed via BotRootKey(...).
	Key *datastore.Key `gae:"$key"`

	// Timestamp of when this event happened.
	//
	// The index is used in a bunch of places:
	// 1. For ordering events chronologically when listing them.
	// 2. Pagination for BQ exports.
	// 3. Old event cleanup cron.
	Timestamp time.Time `gae:"ts"`

	// EventType describes what has happened.
	//
	// See BotEventType for the known values.
	EventType BotEventType `gae:"event_type,noindex"`

	// Message is an optional free form message associated with the event.
	Message string `gae:"message,noindex"`

	// Dimensions is a list of dimensions reported by the bot.
	//
	// TODO(vadimsh): Stop indexing this after turning down native Swarming
	// scheduler. This index is only used in has_capacity(...) implementation,
	// which is a part of the native Swarming scheduler and is not used when
	// running on top of RBE. This index is pretty big (~6 TB) and getting rid
	// of it may also speed up the bot event insertion transaction.
	Dimensions []string `gae:"dimensions_flat"`
}
   435  
   436  // ToProto converts BotEvent to apipb.BotEventResponse.
   437  func (e *BotEvent) ToProto() *apipb.BotEventResponse {
   438  	return &apipb.BotEventResponse{
   439  		Ts:              timestamppb.New(e.Timestamp),
   440  		EventType:       string(e.EventType),
   441  		Message:         e.Message,
   442  		Dimensions:      dimensionsFlatToPb(e.Dimensions),
   443  		State:           string(e.State),
   444  		ExternalIp:      e.ExternalIP,
   445  		AuthenticatedAs: string(e.AuthenticatedAs),
   446  		Version:         e.Version,
   447  		Quarantined:     e.Quarantined,
   448  		MaintenanceMsg:  e.Maintenance,
   449  		TaskId:          e.TaskID,
   450  	}
   451  }
   452  
   453  // BotEventsQuery prepares a query that fetches BotEvent entities for a bot.
   454  //
   455  // Most recent events are returned first.
   456  func BotEventsQuery(ctx context.Context, botID string) *datastore.Query {
   457  	return datastore.NewQuery("BotEvent").Ancestor(BotRootKey(ctx, botID)).Order("-ts")
   458  }
   459  
// BotDimensions is a map with bot dimensions as `key => [values]`.
//
// This type represents bot dimensions in the datastore as a JSON-encoded
// unindexed blob. There's an alternative "flat" indexed representation as a
// list of `key:value` pairs. It is used in BotInfo.Dimensions and
// BotEvent.Dimensions properties.
type BotDimensions map[string][]string
   466  
   467  // ToProperty stores the value as a JSON-blob property.
   468  func (p *BotDimensions) ToProperty() (datastore.Property, error) {
   469  	return ToJSONProperty(p)
   470  }
   471  
   472  // FromProperty loads a JSON-blob property.
   473  func (p *BotDimensions) FromProperty(prop datastore.Property) error {
   474  	return FromJSONProperty(prop, p)
   475  }
   476  
   477  // ToProto returns []apipb.StringListPair, sorted by keys.
   478  func (p BotDimensions) ToProto() []*apipb.StringListPair {
   479  	return MapToStringListPair((map[string][]string)(p), true)
   480  }
   481  
   482  // ToStructPB returns a structpb.Struct.
   483  func (p BotDimensions) ToStructPB() *structpb.Struct {
   484  	s := &structpb.Struct{
   485  		Fields: make(map[string]*structpb.Value, len(p)),
   486  	}
   487  	for key, valList := range p {
   488  		vals := make([]*structpb.Value, 0, len(valList))
   489  		for _, val := range valList {
   490  			vals = append(vals, &structpb.Value{Kind: &structpb.Value_StringValue{StringValue: val}})
   491  		}
   492  		s.Fields[key] = &structpb.Value{
   493  			Kind: &structpb.Value_ListValue{
   494  				ListValue: &structpb.ListValue{Values: vals},
   495  			},
   496  		}
   497  	}
   498  	return s
   499  }