go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/rbe/session.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package rbe implements communication with RBE APIs.
    16  package rbe
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"math/rand"
    22  	"sort"
    23  	"time"
    24  
    25  	statuspb "google.golang.org/genproto/googleapis/rpc/status"
    26  	"google.golang.org/grpc"
    27  	"google.golang.org/grpc/codes"
    28  	"google.golang.org/grpc/status"
    29  	"google.golang.org/protobuf/types/known/anypb"
    30  	"google.golang.org/protobuf/types/known/timestamppb"
    31  
    32  	"go.chromium.org/luci/common/clock"
    33  	"go.chromium.org/luci/common/errors"
    34  	"go.chromium.org/luci/common/logging"
    35  
    36  	"go.chromium.org/luci/swarming/internal/remoteworkers"
    37  	internalspb "go.chromium.org/luci/swarming/proto/internals"
    38  	"go.chromium.org/luci/swarming/server/botsrv"
    39  	"go.chromium.org/luci/swarming/server/hmactoken"
    40  )
    41  
// SessionServer serves handlers for creating and updating RBE bot sessions.
//
// Use NewSessionServer to construct it.
type SessionServer struct {
	rbe        remoteworkers.BotsClient // client used to create and update RBE bot sessions
	hmacSecret *hmactoken.Secret        // to generate session tokens
}
    47  
    48  // NewSessionServer creates a new session server given an RBE client connection.
    49  func NewSessionServer(ctx context.Context, cc []grpc.ClientConnInterface, hmacSecret *hmactoken.Secret) *SessionServer {
    50  	return &SessionServer{
    51  		rbe:        botsConnectionPool(cc),
    52  		hmacSecret: hmacSecret,
    53  	}
    54  }
    55  
    56  ////////////////////////////////////////////////////////////////////////////////
    57  // Structs used by all handlers.
    58  
// WorkerProperties are RBE worker properties unrelated to actual scheduling.
//
// They aren't validated by Swarming and just passed along to RBE. The RBE bots
// obtain them via some external mechanism (e.g. the GCE metadata server).
//
// They are optional and currently used only on bots managed by RBE Worker
// Provider. Empty values are not sent to RBE.
type WorkerProperties struct {
	// PoolID will be used as `rbePoolID` bot session worker property.
	PoolID string `json:"pool_id"`
	// PoolVersion will be used as `rbePoolVersion` bot session worker property.
	PoolVersion string `json:"pool_version"`
}
    72  
    73  ////////////////////////////////////////////////////////////////////////////////
    74  // CreateBotSession handler.
    75  
// CreateBotSessionRequest is a body of `/bot/rbe/session/create` request.
type CreateBotSessionRequest struct {
	// PollToken is a token produced by Python server in `/bot/poll`. Required.
	//
	// This token encodes configuration of the bot maintained by the Python
	// Swarming server.
	PollToken []byte `json:"poll_token"`

	// SessionToken is a session token of a previous session if recreating it.
	//
	// Optional. See the corresponding field in UpdateBotSessionRequest.
	SessionToken []byte `json:"session_token,omitempty"`

	// Dimensions are the dimensions reported by the bot. Required.
	Dimensions map[string][]string `json:"dimensions"`

	// BotVersion identifies the bot software. It is reported to RBE as is.
	BotVersion string `json:"bot_version,omitempty"`

	// WorkerProperties are passed to RBE as worker properties. Optional.
	WorkerProperties *WorkerProperties `json:"worker_properties,omitempty"`
}
    98  
    99  func (r *CreateBotSessionRequest) ExtractPollToken() []byte               { return r.PollToken }
   100  func (r *CreateBotSessionRequest) ExtractSessionToken() []byte            { return r.SessionToken }
   101  func (r *CreateBotSessionRequest) ExtractDimensions() map[string][]string { return r.Dimensions }
   102  
   103  func (r *CreateBotSessionRequest) ExtractDebugRequest() any {
   104  	return &CreateBotSessionRequest{
   105  		Dimensions:       r.Dimensions,
   106  		BotVersion:       r.BotVersion,
   107  		WorkerProperties: r.WorkerProperties,
   108  	}
   109  }
   110  
// CreateBotSessionResponse is a body of `/bot/rbe/session/create` response.
type CreateBotSessionResponse struct {
	// SessionToken is a freshly produced session token.
	//
	// It encodes the RBE bot session ID and bot configuration provided via the
	// poll token.
	//
	// The session token is needed to call `/bot/rbe/session/update`. That call
	// will also periodically refresh it.
	SessionToken []byte `json:"session_token"`

	// SessionExpiry is when this session expires, as Unix timestamp in seconds.
	//
	// The bot should call `/bot/rbe/session/update` before that time.
	SessionExpiry int64 `json:"session_expiry"`

	// SessionID is an RBE bot session ID as encoded in the token.
	//
	// Primarily for the bot debug log.
	SessionID string `json:"session_id"`
}
   132  
   133  // CreateBotSession is an RPC handler that creates a new bot session.
   134  func (srv *SessionServer) CreateBotSession(ctx context.Context, body *CreateBotSessionRequest, r *botsrv.Request) (botsrv.Response, error) {
   135  	// Actually open the session. This should not block, since we aren't picking
   136  	// up any tasks yet (indicated by INITIALIZING status).
   137  	session, err := srv.rbe.CreateBotSession(ctx, &remoteworkers.CreateBotSessionRequest{
   138  		Parent:     r.PollState.RbeInstance,
   139  		BotSession: rbeBotSession("", remoteworkers.BotStatus_INITIALIZING, r.Dimensions, body.BotVersion, body.WorkerProperties, nil),
   140  	})
   141  	if err != nil {
   142  		// Return the exact same gRPC error in a reply. This is fine, we trust the
   143  		// bot, it has already been authorized. It is useful for debugging to see
   144  		// the original RBE errors in the bot logs.
   145  		return nil, err
   146  	}
   147  	logging.Infof(ctx, "%s: %s", r.BotID, session.Name)
   148  	for _, lease := range session.Leases {
   149  		logging.Errorf(ctx, "Unexpected lease when just opening the session: %s", lease)
   150  	}
   151  
   152  	// Return the token that wraps the session ID. The bot will use it when
   153  	// calling `/bot/rbe/session/update`.
   154  	sessionToken, tokenExpiry, err := srv.genSessionToken(ctx, r.PollState, session.Name)
   155  	if err != nil {
   156  		return nil, status.Errorf(codes.Internal, "could not generate session token: %s", err)
   157  	}
   158  	return &CreateBotSessionResponse{
   159  		SessionToken:  sessionToken,
   160  		SessionExpiry: tokenExpiry.Unix(),
   161  		SessionID:     session.Name,
   162  	}, nil
   163  }
   164  
   165  ////////////////////////////////////////////////////////////////////////////////
   166  // UpdateBotSession handler.
   167  
// Lease is a JSON representation of a relevant subset of remoteworkers.Lease.
//
// It is used both in UpdateBotSessionRequest (the lease reported by the bot)
// and in UpdateBotSessionResponse (the lease assigned to the bot).
type Lease struct {
	// ID is the unique reservation ID treated as an opaque string. Required.
	ID string `json:"id"`

	// State is a lease state as stringy remoteworkers.LeaseState enum. Required.
	//
	// Possible values:
	//   * PENDING
	//   * ACTIVE
	//   * COMPLETED
	//   * CANCELLED
	State string `json:"state"`

	// Payload is the reservation payload.
	//
	// Note it is serialized using regular JSON rules, i.e. fields are in
	// "snake_case".
	Payload *internalspb.TaskPayload `json:"payload,omitempty"`

	// Result is the execution result.
	//
	// Note it is serialized using regular JSON rules, i.e. fields are in
	// "snake_case".
	Result *internalspb.TaskResult `json:"result,omitempty"`
}
   194  
// UpdateBotSessionRequest is a body of `/bot/rbe/session/update` request.
//
// If PollToken is present, it will be used to refresh the state stored in the
// session token.
type UpdateBotSessionRequest struct {
	// SessionToken is a token returned by the previous API call. Required.
	//
	// This token is initially returned by `/bot/rbe/session/create` and then
	// refreshed with every `/bot/rbe/session/update` call.
	SessionToken []byte `json:"session_token"`

	// PollToken is a token produced by Python server in `/bot/poll`.
	//
	// It is optional and present only in the outer bot poll loop, when the bot
	// polls both Python Swarming server (to get new configs) and Swarming RBE
	// server (to get new tasks).
	//
	// Internals of this token will be copied into the session token returned in
	// the response to this call.
	PollToken []byte `json:"poll_token,omitempty"`

	// Dimensions are the dimensions reported by the bot. Required.
	Dimensions map[string][]string `json:"dimensions"`

	// BotVersion identifies the bot software. It is reported to RBE as is.
	BotVersion string `json:"bot_version,omitempty"`

	// WorkerProperties are passed to RBE as worker properties. Optional.
	WorkerProperties *WorkerProperties `json:"worker_properties,omitempty"`

	// Status is the intended bot session status as stringy
	// remoteworkers.BotStatus enum. Required.
	//
	// Possible values:
	//   * OK
	//   * UNHEALTHY
	//   * HOST_REBOOTING
	//   * BOT_TERMINATING
	//   * INITIALIZING
	//   * MAINTENANCE
	Status string `json:"status"`

	// Nonblocking is true if the bot doesn't want to block waiting for new
	// leases to appear.
	Nonblocking bool `json:"nonblocking"`

	// Lease is the lease the bot is currently working on or has just finished
	// working on.
	//
	// Allowed lease states here are:
	//   * ACTIVE: the bot is still working on the lease.
	//   * COMPLETED: the bot has finished working on the lease. Result field
	//     should be populated. This state is also used to report the bot is done
	//     working on a canceled lease.
	//
	// Payload field is always ignored.
	Lease *Lease `json:"lease,omitempty"`
}
   251  
   252  func (r *UpdateBotSessionRequest) ExtractPollToken() []byte               { return r.PollToken }
   253  func (r *UpdateBotSessionRequest) ExtractSessionToken() []byte            { return r.SessionToken }
   254  func (r *UpdateBotSessionRequest) ExtractDimensions() map[string][]string { return r.Dimensions }
   255  
   256  func (r *UpdateBotSessionRequest) ExtractDebugRequest() any {
   257  	return &UpdateBotSessionRequest{
   258  		Dimensions:       r.Dimensions,
   259  		BotVersion:       r.BotVersion,
   260  		WorkerProperties: r.WorkerProperties,
   261  		Status:           r.Status,
   262  		Nonblocking:      r.Nonblocking,
   263  		Lease:            r.Lease,
   264  	}
   265  }
   266  
// UpdateBotSessionResponse is a body of `/bot/rbe/session/update` response.
type UpdateBotSessionResponse struct {
	// SessionToken is a refreshed session token, if available.
	//
	// It carries the same RBE bot session ID inside as the incoming token. The
	// bot must use it in the next `/bot/rbe/session/update` request.
	//
	// If the incoming token has expired already, this field will be empty, since
	// it is not possible to refresh an expired token.
	SessionToken []byte `json:"session_token,omitempty"`

	// SessionExpiry is when this session expires, as Unix timestamp in seconds.
	//
	// The bot should call `/bot/rbe/session/update` again before that time.
	//
	// If the session token has expired already, this field will be empty.
	SessionExpiry int64 `json:"session_expiry,omitempty"`

	// Status is the session status as seen by the server, as stringy
	// remoteworkers.BotStatus enum.
	//
	// Possible values:
	//   * OK: if the session is healthy.
	//   * BOT_TERMINATING: if the session has expired.
	Status string `json:"status"`

	// Lease is the lease the bot should be working on or should cancel now, if
	// any.
	//
	// Possible lease states here:
	//   * PENDING: the bot should start working on this new lease. It has Payload
	//     field populated. Can only happen in reply to a bot reporting no lease
	//     or a completed lease.
	//   * ACTIVE: the bot should keep working on the lease it just reported. Can
	//     only happen in reply to a bot reporting an active lease. Payload is not
	//     populated (the bot should know it already).
	//   * CANCELLED: the bot should stop working on the lease it just reported.
	//     Once the bot is done working on the lease, it should update the session
	//     again, marking the lease as COMPLETED. Payload is not populated.
	//
	// If the bot was stuck for a while and the RBE canceled the lease as lost,
	// this field will be unpopulated, even if the bot reported an active lease.
	// The bot should give up on the current lease ASAP, without even reporting
	// its result back (because the server gave up on it already anyway).
	Lease *Lease `json:"lease,omitempty"`
}
   311  
// UpdateBotSession is an RPC handler that updates a bot session.
//
// The overall flow:
//  1. If the request carries no session ID, reply with BOT_TERMINATING status
//     when the session token has expired, or fail with InvalidArgument
//     otherwise.
//  2. Validate the session status and the reported lease (only ACTIVE and
//     COMPLETED lease states are accepted) and convert them to RBE types.
//  3. Call RBE UpdateBotSession with a timeout that depends on
//     body.Nonblocking.
//  4. Pick at most one lease from the RBE response to report back to the bot.
//  5. Generate a refreshed session token and return it with the session status
//     and the picked lease.
//
// Errors from the RBE call are returned as is, except DeadlineExceeded when the
// bot was just polling for new work (status OK, no lease reported): that one is
// converted into a successful "OK, no lease" response.
func (srv *SessionServer) UpdateBotSession(ctx context.Context, body *UpdateBotSessionRequest, r *botsrv.Request) (botsrv.Response, error) {
	if r.SessionID == "" {
		// This can happen if the bot got stuck for a long time and its session
		// token has expired. Its RBE session likely has expired as well. Return the
		// corresponding response to let the bot know it needs to recreate
		// the session.
		if r.SessionTokenExpired {
			logging.Warningf(ctx, "%s: expired session token", r.BotID)
			logSession(ctx, "Input", body.Status, body.Lease)
			resp := &UpdateBotSessionResponse{
				Status: remoteworkers.BotStatus_name[int32(remoteworkers.BotStatus_BOT_TERMINATING)],
			}
			logSession(ctx, "Output", resp.Status, nil)
			return resp, nil
		}
		// This can happen if the session token was omitted in the request. This is
		// not allowed.
		return nil, status.Errorf(codes.InvalidArgument, "missing session ID")
	}

	logging.Infof(ctx, "%s: %s", r.BotID, r.SessionID)
	logSession(ctx, "Input", body.Status, body.Lease)

	// Need a recognizable status enum. An unknown string is absent from the
	// BotStatus_value map and thus maps to the zero value, which is checked
	// against BOT_STATUS_UNSPECIFIED below.
	botStatus := remoteworkers.BotStatus(remoteworkers.BotStatus_value[body.Status])
	if botStatus == remoteworkers.BotStatus_BOT_STATUS_UNSPECIFIED {
		if body.Status == "" {
			return nil, status.Errorf(codes.InvalidArgument, "missing session status")
		}
		return nil, status.Errorf(codes.InvalidArgument, "unrecognized session status %q", body.Status)
	}

	// Convert our Lease to the RBE remoteworkers.Lease. We expect only ACTIVE or
	// COMPLETED leases, see UpdateBotSessionRequest comment.
	var leaseIn *remoteworkers.Lease
	if body.Lease != nil {
		leaseIn = &remoteworkers.Lease{
			Id:    body.Lease.ID,
			State: remoteworkers.LeaseState(remoteworkers.LeaseState_value[body.Lease.State]),
		}
		switch leaseIn.State {
		case remoteworkers.LeaseState_ACTIVE:
			// This is a "keep alive" update.
		case remoteworkers.LeaseState_COMPLETED:
			// This is a result-reporting update. Populate the result, if any.
			leaseIn.Status = &statuspb.Status{} // means "OK"
			if body.Lease.Result != nil {
				var err error
				if leaseIn.Result, err = anypb.New(body.Lease.Result); err != nil {
					return nil, status.Errorf(codes.Internal, "failed to serialize TaskResult: %s", err)
				}
			}
		case remoteworkers.LeaseState_LEASE_STATE_UNSPECIFIED:
			if body.Lease.State == "" {
				return nil, status.Errorf(codes.InvalidArgument, "missing lease state")
			}
			return nil, status.Errorf(codes.InvalidArgument, "unrecognized lease state %q", body.Lease.State)
		default:
			// A recognized state that the bot is not allowed to report.
			return nil, status.Errorf(codes.InvalidArgument, "unexpected lease state %q", body.Lease.State)
		}
	}

	// If there are no pending leases, RBE seems to block for `<rpc deadline>-10s`
	// (not doing anything at all if the RPC deadline is less than 10s).
	var timeout time.Duration
	if body.Nonblocking {
		// RPCs with timeout of less than 10s are treated by RBE as non-blocking.
		// Note the timeout is propagated via gRPC metadata headers, it is like an
		// implicit RPC parameter. This should be pretty deterministic.
		timeout = 9 * time.Second
	} else {
		// Since we are running on GAE, we are limited by 1m total. Tell RBE we
		// have ~50s, it will block for ~40s, giving us ~20s of spare time.
		//
		// Randomize this timeout a bit to avoid freshly restarted bots calling us
		// in synchronized "waves".
		//
		// TODO(vadimsh): This needs more tuning, in particular in combination with
		// GAE's `max_concurrent_requests` parameter.
		timeout = randomDuration(45*time.Second, 55*time.Second)
	}

	// The timeout applies only to the RBE call: the session token is generated
	// below using the original `ctx`.
	rpcCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	session, err := srv.rbe.UpdateBotSession(rpcCtx, &remoteworkers.UpdateBotSessionRequest{
		Name:       r.SessionID,
		BotSession: rbeBotSession(r.SessionID, botStatus, r.Dimensions, body.BotVersion, body.WorkerProperties, leaseIn),
	})

	if err != nil {
		// If the bot was just polling for new work, treat DEADLINE_EXCEEDED as
		// "no work available". Otherwise we may end up replying with a lot of
		// errors and GAE treats this as a signal that the instance is unhealthy
		// and kills it.
		if status.Code(err) == codes.DeadlineExceeded && leaseIn == nil && botStatus == remoteworkers.BotStatus_OK {
			logging.Warningf(ctx, "Deadline exceeded when polling for new leases")
			sessionToken, tokenExpiry, err := srv.genSessionToken(ctx, r.PollState, r.SessionID)
			if err != nil {
				return nil, status.Errorf(codes.Internal, "could not generate session token: %s", err)
			}
			return &UpdateBotSessionResponse{
				SessionToken:  sessionToken,
				SessionExpiry: tokenExpiry.Unix(),
				Status:        "OK",
			}, nil
		}
		// Return the exact same gRPC error in a reply. This is fine, we trust the
		// bot, it has already been authorized. It is useful for debugging to see
		// the original RBE errors.
		return nil, err
	}

	// The RBE backend always replies with either OK or BOT_TERMINATING status.
	// Note that it replies with OK status even if we told it we want the session
	// terminated. The only time it replies with BOT_TERMINATING is when the
	// session was *already* dead (either closed by the bot previously or timed
	// out by the RBE server).
	acceptingLeases := botStatus == remoteworkers.BotStatus_OK
	switch session.Status {
	case remoteworkers.BotStatus_OK:
		// Do nothing. This is fine. Trust `botStatus` was applied.
	case remoteworkers.BotStatus_BOT_TERMINATING:
		// The session was already closed previously.
		acceptingLeases = false
	default: // i.e. all other "unhealthy" or "not ready" statuses
		logging.Errorf(ctx, "Unexpected status change from RBE: %s => %s", botStatus, session.Status)
		acceptingLeases = false
	}

	if !acceptingLeases {
		// RBE should not assign leases to a terminating or unhealthy bot. Log and
		// drop any such leases so that none of them is reported to the bot.
		for _, lease := range session.Leases {
			logging.Errorf(ctx, "Unexpected RBE lease: %s", lease)
		}
		session.Leases = nil
	}

	// The lease we'll report to the bot.
	var leaseOut *remoteworkers.Lease
	var leasePayload *internalspb.TaskPayload

	// If a bot reported an ACTIVE lease the RBE server should either ack it as
	// ACTIVE as well or report it as CANCELED. Additionally if the bot was stuck
	// and didn't ping the lease in a while, the RBE server marks the lease as
	// lost and silently ignores it, i.e. doesn't return it in session.Leases.
	if leaseIn != nil && leaseIn.State == remoteworkers.LeaseState_ACTIVE {
		// Find the reported lease in the response. There should be no other leases.
		for _, lease := range session.Leases {
			if lease.Id == leaseIn.Id {
				leaseOut = lease
				if leaseOut.State != remoteworkers.LeaseState_ACTIVE && leaseOut.State != remoteworkers.LeaseState_CANCELLED {
					return nil, status.Errorf(codes.Internal, "unexpected ACTIVE lease state transition to %s", leaseOut.State)
				}
				if leaseOut.Payload != nil {
					logging.Errorf(ctx, "Unexpected payload in the lease, dropping it")
					leaseOut.Payload = nil
				}
			} else {
				logging.Errorf(ctx, "Unexpected RBE lease: %s", lease)
			}
		}
		if leaseOut == nil {
			// The response will carry no lease, see UpdateBotSessionResponse.Lease.
			logging.Warningf(ctx, "The bot lost the lease")
		}
	}

	// If a bot reported no lease at all or a COMPLETED lease, the server should
	// return at most one new lease in PENDING state with its payload populated.
	if leaseIn == nil || leaseIn.State == remoteworkers.LeaseState_COMPLETED {
		// Fish out a PENDING lease, if any, ignoring everything else (there should
		// not be anything else there). Only the first PENDING lease is picked up,
		// all leases after it are logged as unexpected.
		for _, lease := range session.Leases {
			if leaseOut != nil {
				logging.Errorf(ctx, "Unexpected RBE lease: %s", lease)
				continue
			}
			if lease.State == remoteworkers.LeaseState_PENDING {
				leaseOut = lease
			} else {
				logging.Errorf(ctx, "Unexpected non-pending RBE lease: %s", lease)
			}
		}
		if leaseOut != nil {
			// Check this PENDING lease has the payload in a format we understand.
			leasePayload = &internalspb.TaskPayload{}
			if err := leaseOut.Payload.UnmarshalTo(leasePayload); err != nil {
				// TODO(vadimsh): This is a fatally broken task with missing or
				// unrecognized payload, need to tell the RBE to drop it otherwise it
				// will haunt this bot until its expiration.
				logging.Errorf(ctx, "Failed to unmarshal lease payload:\n%s", prettyProto(leaseOut))
				return nil, status.Errorf(codes.Internal, "failed to unmarshal pending lease payload: %s", err)
			}
		}
	}

	// Convert the output lease to the API response form. Note that leasePayload
	// is populated only for PENDING leases.
	var respLease *Lease
	if leaseOut != nil {
		respLease = &Lease{
			ID:      leaseOut.Id,
			State:   remoteworkers.LeaseState_name[int32(leaseOut.State)],
			Payload: leasePayload,
		}
	}

	// Refresh the session token and embed new, potentially updated, PollState
	// into it. Note that generating this token is just a local HMAC operation,
	// which is super fast so it's fine to do it on every response.
	sessionToken, tokenExpiry, err := srv.genSessionToken(ctx, r.PollState, r.SessionID)
	if err != nil {
		return nil, status.Errorf(codes.Internal, "could not generate session token: %s", err)
	}
	resp := &UpdateBotSessionResponse{
		SessionToken:  sessionToken,
		SessionExpiry: tokenExpiry.Unix(),
		Status:        remoteworkers.BotStatus_name[int32(session.Status)],
		Lease:         respLease,
	}
	logSession(ctx, "Output", resp.Status, resp.Lease)
	return resp, nil
}
   535  
   536  ////////////////////////////////////////////////////////////////////////////////
   537  // Helpers.
   538  
// sessionTokenExpiry puts a limit on how seldom an active bot can call Swarming
// RBE endpoints.
//
// It is the lifetime of every session token produced by genSessionToken.
//
// Healthy bots will never ever hit this limit, they call an endpoint every few
// minutes.
//
// Note that RBE's BotSession proto also has ExpireTime field, but it appears
// it is never populated.
const sessionTokenExpiry = 4 * time.Hour
   548  
   549  // genSessionToken generates a new session token.
   550  func (srv *SessionServer) genSessionToken(ctx context.Context, ps *internalspb.PollState, rbeSessionID string) (tok []byte, expiry time.Time, err error) {
   551  	if rbeSessionID == "" {
   552  		return nil, time.Time{}, errors.Reason("RBE session ID is unexpectedly missing").Err()
   553  	}
   554  	expiry = clock.Now(ctx).Add(sessionTokenExpiry).Round(time.Second)
   555  	blob, err := srv.hmacSecret.GenerateToken(&internalspb.BotSession{
   556  		RbeBotSessionId: rbeSessionID,
   557  		PollState:       ps,
   558  		Expiry:          timestamppb.New(expiry),
   559  	})
   560  	if err != nil {
   561  		return nil, time.Time{}, err
   562  	}
   563  	return blob, expiry, nil
   564  }
   565  
   566  // rbeBotSession constructs remoteworkers.BotSession based on validated bot
   567  // dimensions and the current lease.
   568  func rbeBotSession(
   569  	sessionID string,
   570  	status remoteworkers.BotStatus,
   571  	dims map[string][]string,
   572  	botVersion string,
   573  	workerProps *WorkerProperties,
   574  	lease *remoteworkers.Lease,
   575  ) *remoteworkers.BotSession {
   576  	var props []*remoteworkers.Device_Property
   577  	var botID string
   578  
   579  	// Note that at this point `dims` are validated already by botsrv.Server and
   580  	// we can panic on unexpected values.
   581  	for key, values := range dims {
   582  		if key == "id" {
   583  			if len(values) != 1 {
   584  				panic(fmt.Sprintf("unexpected `id` dimension values: %v", values))
   585  			}
   586  			botID = values[0]
   587  		} else {
   588  			for _, val := range values {
   589  				props = append(props, &remoteworkers.Device_Property{
   590  					Key:   "label:" + key,
   591  					Value: val,
   592  				})
   593  			}
   594  		}
   595  	}
   596  	if botID == "" {
   597  		panic("bot ID is missing in dimensions")
   598  	}
   599  
   600  	// Sort to make logging output more stable and to simplify tests.
   601  	sort.Slice(props, func(i, j int) bool {
   602  		if props[i].Key == props[j].Key {
   603  			return props[i].Value < props[j].Value
   604  		}
   605  		return props[i].Key < props[j].Key
   606  	})
   607  
   608  	// These are used to associated the RBE worker with its worker provider pool.
   609  	var workerPropsList []*remoteworkers.Worker_Property
   610  	if workerProps != nil {
   611  		if workerProps.PoolID != "" {
   612  			workerPropsList = append(workerPropsList, &remoteworkers.Worker_Property{
   613  				Key:   "rbePoolID",
   614  				Value: workerProps.PoolID,
   615  			})
   616  		}
   617  		if workerProps.PoolVersion != "" {
   618  			workerPropsList = append(workerPropsList, &remoteworkers.Worker_Property{
   619  				Key:   "rbePoolVersion",
   620  				Value: workerProps.PoolVersion,
   621  			})
   622  		}
   623  	}
   624  
   625  	var leases []*remoteworkers.Lease
   626  	if lease != nil {
   627  		leases = []*remoteworkers.Lease{lease}
   628  	}
   629  
   630  	return &remoteworkers.BotSession{
   631  		BotId:   botID,
   632  		Name:    sessionID,
   633  		Version: botVersion,
   634  		Status:  status,
   635  		Leases:  leases,
   636  		Worker: &remoteworkers.Worker{
   637  			Properties: workerPropsList,
   638  			Devices: []*remoteworkers.Device{
   639  				{
   640  					Handle:     "primary",
   641  					Properties: props,
   642  				},
   643  			},
   644  		},
   645  	}
   646  }
   647  
   648  // randomDuration returns a uniformly distributed random number in range [a, b).
   649  func randomDuration(a, b time.Duration) time.Duration {
   650  	return a + time.Duration(rand.Int63n(int64(b-a)))
   651  }
   652  
   653  // logSession logs some basic information about the session.
   654  func logSession(ctx context.Context, direction, status string, lease *Lease) {
   655  	if lease != nil {
   656  		logging.Infof(ctx, "%s: %s, lease %s %s", direction, status, lease.State, lease.ID)
   657  	} else {
   658  		logging.Infof(ctx, "%s: %s, no lease", direction, status)
   659  	}
   660  }