go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/rbe/reservation_test.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package rbe
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"testing"
    21  	"time"
    22  
    23  	"google.golang.org/grpc"
    24  	"google.golang.org/grpc/codes"
    25  	"google.golang.org/grpc/status"
    26  	"google.golang.org/protobuf/proto"
    27  	"google.golang.org/protobuf/types/known/anypb"
    28  	"google.golang.org/protobuf/types/known/durationpb"
    29  	"google.golang.org/protobuf/types/known/emptypb"
    30  	"google.golang.org/protobuf/types/known/timestamppb"
    31  
    32  	"go.chromium.org/luci/common/clock"
    33  	"go.chromium.org/luci/common/clock/testclock"
    34  	"go.chromium.org/luci/common/retry/transient"
    35  	"go.chromium.org/luci/gae/impl/memory"
    36  	"go.chromium.org/luci/gae/service/datastore"
    37  	"go.chromium.org/luci/server/tq"
    38  
    39  	"go.chromium.org/luci/swarming/internal/remoteworkers"
    40  	internalspb "go.chromium.org/luci/swarming/proto/internals"
    41  	"go.chromium.org/luci/swarming/server/model"
    42  
    43  	. "github.com/smartystreets/goconvey/convey"
    44  	. "go.chromium.org/luci/common/testing/assertions"
    45  )
    46  
    47  func TestReservationServer(t *testing.T) {
    48  	t.Parallel()
    49  
    50  	Convey("With mocks", t, func() {
    51  		const rbeInstance = "projects/x/instances/y"
    52  		const rbeReservation = "reservation-id"
    53  
    54  		ctx := memory.Use(context.Background())
    55  		ctx, _ = testclock.UseTime(ctx, testclock.TestRecentTimeUTC)
    56  		rbe := mockedReservationClient{
    57  			newState: remoteworkers.ReservationState_RESERVATION_PENDING,
    58  		}
    59  		internals := mockedInternalsClient{}
    60  		srv := ReservationServer{
    61  			rbe:           &rbe,
    62  			internals:     &internals,
    63  			serverVersion: "go-version",
    64  		}
    65  
    66  		expirationTimeout := time.Hour
    67  		executionTimeout := 10 * time.Minute
    68  		expiry := clock.Now(ctx).Add(expirationTimeout).UTC()
    69  
    70  		enqueueTask := &internalspb.EnqueueRBETask{
    71  			Payload: &internalspb.TaskPayload{
    72  				ReservationId:  rbeReservation,
    73  				TaskId:         "60b2ed0a43023110",
    74  				TaskToRunShard: 14,
    75  				TaskToRunId:    1,
    76  				DebugInfo: &internalspb.TaskPayload_DebugInfo{
    77  					PySwarmingVersion: "py-version",
    78  				},
    79  			},
    80  			RbeInstance:      rbeInstance,
    81  			Expiry:           timestamppb.New(expiry),
    82  			ExecutionTimeout: durationpb.New(executionTimeout),
    83  			RequestedBotId:   "some-bot-id",
    84  			Constraints: []*internalspb.EnqueueRBETask_Constraint{
    85  				{Key: "key1", AllowedValues: []string{"v1", "v2"}},
    86  				{Key: "key2", AllowedValues: []string{"v3"}},
    87  			},
    88  			Priority: 123,
    89  		}
    90  
    91  		taskReqKey, err := model.TaskIDToRequestKey(ctx, enqueueTask.Payload.TaskId)
    92  		So(err, ShouldBeNil)
    93  		taskToRun := &model.TaskToRun{
    94  			Key: model.TaskToRunKey(ctx, taskReqKey,
    95  				enqueueTask.Payload.TaskToRunShard,
    96  				enqueueTask.Payload.TaskToRunId,
    97  			),
    98  			Expiration: datastore.NewIndexedOptional(expiry),
    99  		}
   100  		So(datastore.Put(ctx, taskToRun), ShouldBeNil)
   101  
   102  		Convey("handleEnqueueRBETask ok", func() {
   103  			err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   104  			So(err, ShouldBeNil)
   105  
   106  			expectedPayload, _ := anypb.New(&internalspb.TaskPayload{
   107  				ReservationId:  rbeReservation,
   108  				TaskId:         "60b2ed0a43023110",
   109  				TaskToRunShard: 14,
   110  				TaskToRunId:    1,
   111  				DebugInfo: &internalspb.TaskPayload_DebugInfo{
   112  					PySwarmingVersion: "py-version",
   113  					GoSwarmingVersion: "go-version",
   114  				},
   115  			})
   116  
   117  			So(rbe.reservation, ShouldResembleProto, &remoteworkers.Reservation{
   118  				Name:    fmt.Sprintf("%s/reservations/%s", rbeInstance, rbeReservation),
   119  				State:   remoteworkers.ReservationState_RESERVATION_PENDING,
   120  				Payload: expectedPayload,
   121  				Constraints: []*remoteworkers.Constraint{
   122  					{Key: "label:key1", AllowedValues: []string{"v1", "v2"}},
   123  					{Key: "label:key2", AllowedValues: []string{"v3"}},
   124  				},
   125  				ExpireTime:       timestamppb.New(expiry.Add(executionTimeout)),
   126  				QueuingTimeout:   durationpb.New(expirationTimeout),
   127  				ExecutionTimeout: durationpb.New(executionTimeout),
   128  				Priority:         123,
   129  				RequestedBotId:   "some-bot-id",
   130  			})
   131  		})
   132  
   133  		Convey("handleEnqueueRBETask TaskToRun is gone", func() {
   134  			So(datastore.Delete(ctx, datastore.KeyForObj(ctx, taskToRun)), ShouldBeNil)
   135  
   136  			err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   137  			So(err, ShouldBeNil)
   138  
   139  			// Didn't call RBE.
   140  			So(rbe.reservation, ShouldBeNil)
   141  		})
   142  
   143  		Convey("handleEnqueueRBETask TaskToRun is claimed", func() {
   144  			taskToRun.Expiration.Unset()
   145  			So(datastore.Put(ctx, taskToRun), ShouldBeNil)
   146  
   147  			err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   148  			So(err, ShouldBeNil)
   149  
   150  			// Didn't call RBE.
   151  			So(rbe.reservation, ShouldBeNil)
   152  		})
   153  
   154  		Convey("handleEnqueueRBETask transient err", func() {
   155  			rbe.errCreate = status.Errorf(codes.Internal, "boom")
   156  			err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   157  			So(err, ShouldNotBeNil)
   158  			So(transient.Tag.In(err), ShouldBeTrue)
   159  		})
   160  
   161  		Convey("handleEnqueueRBETask already exists", func() {
   162  			rbe.errCreate = status.Errorf(codes.AlreadyExists, "boom")
   163  			err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   164  			So(err, ShouldBeNil)
   165  		})
   166  
   167  		Convey("handleEnqueueRBETask fatal error", func() {
   168  			Convey("expected error, report ok", func() {
   169  				rbe.errCreate = status.Errorf(codes.FailedPrecondition, "boom")
   170  				internals.expireSlice = func(req *internalspb.ExpireSliceRequest) error {
   171  					So(req, ShouldResembleProto, &internalspb.ExpireSliceRequest{
   172  						TaskId:         enqueueTask.Payload.TaskId,
   173  						TaskToRunShard: enqueueTask.Payload.TaskToRunShard,
   174  						TaskToRunId:    enqueueTask.Payload.TaskToRunId,
   175  						Reason:         internalspb.ExpireSliceRequest_NO_RESOURCE,
   176  						Details:        "rpc error: code = FailedPrecondition desc = boom",
   177  					})
   178  					return nil
   179  				}
   180  				err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   181  				So(tq.Ignore.In(err), ShouldBeTrue)
   182  			})
   183  
   184  			Convey("unexpected error, report ok", func() {
   185  				rbe.errCreate = status.Errorf(codes.PermissionDenied, "boom")
   186  				internals.expireSlice = func(req *internalspb.ExpireSliceRequest) error {
   187  					So(req, ShouldResembleProto, &internalspb.ExpireSliceRequest{
   188  						TaskId:         enqueueTask.Payload.TaskId,
   189  						TaskToRunShard: enqueueTask.Payload.TaskToRunShard,
   190  						TaskToRunId:    enqueueTask.Payload.TaskToRunId,
   191  						Reason:         internalspb.ExpireSliceRequest_PERMISSION_DENIED,
   192  						Details:        "rpc error: code = PermissionDenied desc = boom",
   193  					})
   194  					return nil
   195  				}
   196  				err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   197  				So(tq.Fatal.In(err), ShouldBeTrue)
   198  			})
   199  
   200  			Convey("expected, report failed", func() {
   201  				rbe.errCreate = status.Errorf(codes.FailedPrecondition, "boom")
   202  				internals.expireSlice = func(_ *internalspb.ExpireSliceRequest) error {
   203  					return status.Errorf(codes.InvalidArgument, "boom")
   204  				}
   205  				err := srv.handleEnqueueRBETask(ctx, enqueueTask)
   206  				So(err, ShouldNotBeNil)
   207  				So(tq.Ignore.In(err), ShouldBeFalse)
   208  				So(tq.Fatal.In(err), ShouldBeFalse)
   209  			})
   210  		})
   211  
   212  		Convey("handleCancelRBETask ok", func() {
   213  			err := srv.handleCancelRBETask(ctx, &internalspb.CancelRBETask{
   214  				RbeInstance:   rbeInstance,
   215  				ReservationId: rbeReservation,
   216  			})
   217  			So(err, ShouldBeNil)
   218  			So(rbe.lastCancel, ShouldResembleProto, &remoteworkers.CancelReservationRequest{
   219  				Name:   fmt.Sprintf("%s/reservations/%s", rbeInstance, rbeReservation),
   220  				Intent: remoteworkers.CancelReservationIntent_ANY,
   221  			})
   222  		})
   223  
   224  		Convey("handleCancelRBETask not found", func() {
   225  			rbe.errCancel = status.Errorf(codes.NotFound, "boo")
   226  			err := srv.handleCancelRBETask(ctx, &internalspb.CancelRBETask{
   227  				RbeInstance:   rbeInstance,
   228  				ReservationId: rbeReservation,
   229  			})
   230  			So(tq.Ignore.In(err), ShouldBeTrue)
   231  		})
   232  
   233  		Convey("handleCancelRBETask internal", func() {
   234  			rbe.errCancel = status.Errorf(codes.Internal, "boo")
   235  			err := srv.handleCancelRBETask(ctx, &internalspb.CancelRBETask{
   236  				RbeInstance:   rbeInstance,
   237  				ReservationId: rbeReservation,
   238  			})
   239  			So(transient.Tag.In(err), ShouldBeTrue)
   240  		})
   241  
   242  		Convey("ExpireSliceBasedOnReservation", func() {
   243  			const (
   244  				reservationName = "projects/.../instances/.../reservations/..."
   245  				taskSliceIndex  = 1
   246  				taskToRunShard  = 5
   247  				taskToRunID     = 678
   248  				taskID          = "637f8e221100aa10"
   249  			)
   250  
   251  			var (
   252  				expireSliceReason  internalspb.ExpireSliceRequest_Reason
   253  				expireSliceDetails string
   254  			)
   255  			internals.expireSlice = func(r *internalspb.ExpireSliceRequest) error {
   256  				So(r.TaskId, ShouldEqual, taskID)
   257  				So(r.TaskToRunShard, ShouldEqual, taskToRunShard)
   258  				So(r.TaskToRunId, ShouldEqual, taskToRunID)
   259  				So(r.Reason, ShouldNotEqual, internalspb.ExpireSliceRequest_REASON_UNSPECIFIED)
   260  				expireSliceReason = r.Reason
   261  				expireSliceDetails = r.Details
   262  				return nil
   263  			}
   264  
   265  			prepTaskToRun := func(reapable bool) {
   266  				var exp datastore.Optional[time.Time, datastore.Indexed]
   267  				if reapable {
   268  					exp.Set(testclock.TestRecentTimeUTC.Add(time.Hour))
   269  				}
   270  				taskReqKey, _ := model.TaskIDToRequestKey(ctx, taskID)
   271  				So(datastore.Put(ctx, &model.TaskToRun{
   272  					Key:        model.TaskToRunKey(ctx, taskReqKey, taskToRunShard, taskToRunID),
   273  					Expiration: exp,
   274  				}), ShouldBeNil)
   275  			}
   276  
   277  			prepReapableTaskToRun := func() { prepTaskToRun(true) }
   278  			prepClaimedTaskToRun := func() { prepTaskToRun(false) }
   279  
   280  			expireBasedOnReservation := func(state remoteworkers.ReservationState, statusErr error, result *internalspb.TaskResult) {
   281  				rbe.reservation = &remoteworkers.Reservation{
   282  					Name:   reservationName,
   283  					State:  state,
   284  					Status: status.Convert(statusErr).Proto(),
   285  				}
   286  				rbe.reservation.Payload, _ = anypb.New(&internalspb.TaskPayload{
   287  					ReservationId:  "",
   288  					TaskId:         taskID,
   289  					SliceIndex:     taskSliceIndex,
   290  					TaskToRunShard: taskToRunShard,
   291  					TaskToRunId:    taskToRunID,
   292  				})
   293  				if result != nil {
   294  					rbe.reservation.Result, _ = anypb.New(result)
   295  				}
   296  				expireSliceReason = internalspb.ExpireSliceRequest_REASON_UNSPECIFIED
   297  				expireSliceDetails = ""
   298  				So(srv.ExpireSliceBasedOnReservation(ctx, reservationName), ShouldBeNil)
   299  			}
   300  
   301  			expectNoExpireSlice := func() {
   302  				So(expireSliceReason, ShouldEqual, internalspb.ExpireSliceRequest_REASON_UNSPECIFIED)
   303  			}
   304  
   305  			expectExpireSlice := func(r internalspb.ExpireSliceRequest_Reason, details string) {
   306  				So(expireSliceReason, ShouldEqual, r)
   307  				So(expireSliceDetails, ShouldContainSubstring, details)
   308  			}
   309  
   310  			Convey("Still pending", func() {
   311  				prepReapableTaskToRun()
   312  				expireBasedOnReservation(
   313  					remoteworkers.ReservationState_RESERVATION_PENDING,
   314  					nil,
   315  					nil,
   316  				)
   317  				expectNoExpireSlice()
   318  			})
   319  
   320  			Convey("Successful", func() {
   321  				prepClaimedTaskToRun()
   322  				expireBasedOnReservation(
   323  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   324  					nil,
   325  					&internalspb.TaskResult{},
   326  				)
   327  				expectNoExpireSlice()
   328  			})
   329  
   330  			Convey("Canceled #1", func() {
   331  				prepClaimedTaskToRun()
   332  				expireBasedOnReservation(
   333  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   334  					status.Errorf(codes.Canceled, "canceled"),
   335  					nil,
   336  				)
   337  				expectNoExpireSlice()
   338  			})
   339  
   340  			Convey("Canceled #2", func() {
   341  				prepClaimedTaskToRun()
   342  				expireBasedOnReservation(
   343  					remoteworkers.ReservationState_RESERVATION_CANCELLED,
   344  					nil,
   345  					nil,
   346  				)
   347  				expectNoExpireSlice()
   348  			})
   349  
   350  			Convey("Expired", func() {
   351  				prepReapableTaskToRun()
   352  				expireBasedOnReservation(
   353  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   354  					status.Errorf(codes.DeadlineExceeded, "deadline"),
   355  					nil,
   356  				)
   357  				expectExpireSlice(internalspb.ExpireSliceRequest_EXPIRED, "deadline")
   358  			})
   359  
   360  			Convey("No resources", func() {
   361  				prepReapableTaskToRun()
   362  				expireBasedOnReservation(
   363  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   364  					status.Errorf(codes.FailedPrecondition, "no bots"),
   365  					nil,
   366  				)
   367  				expectExpireSlice(internalspb.ExpireSliceRequest_NO_RESOURCE, "no bots")
   368  			})
   369  
   370  			Convey("Bot internal error", func() {
   371  				prepReapableTaskToRun()
   372  				expireBasedOnReservation(
   373  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   374  					status.Errorf(codes.DeadlineExceeded, "ignored"),
   375  					&internalspb.TaskResult{BotInternalError: "boom"},
   376  				)
   377  				expectExpireSlice(internalspb.ExpireSliceRequest_BOT_INTERNAL_ERROR, "boom")
   378  			})
   379  
   380  			Convey("Aborted before claimed", func() {
   381  				prepReapableTaskToRun()
   382  				expireBasedOnReservation(
   383  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   384  					status.Errorf(codes.Aborted, "bot died"),
   385  					nil,
   386  				)
   387  				expectExpireSlice(internalspb.ExpireSliceRequest_BOT_INTERNAL_ERROR, "bot died")
   388  			})
   389  
   390  			Convey("Unexpectedly successful reservations", func() {
   391  				prepReapableTaskToRun()
   392  				expireBasedOnReservation(
   393  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   394  					nil,
   395  					nil,
   396  				)
   397  				expectExpireSlice(internalspb.ExpireSliceRequest_BOT_INTERNAL_ERROR, "unexpectedly finished")
   398  			})
   399  
   400  			Convey("Unexpectedly canceled reservations", func() {
   401  				prepReapableTaskToRun()
   402  				expireBasedOnReservation(
   403  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   404  					status.Errorf(codes.Canceled, "ignored"),
   405  					nil,
   406  				)
   407  				expectNoExpireSlice()
   408  			})
   409  
   410  			Convey("Skips already claimed TaskToRun", func() {
   411  				prepClaimedTaskToRun()
   412  				expireBasedOnReservation(
   413  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   414  					status.Errorf(codes.FailedPrecondition, "no bots"),
   415  					nil,
   416  				)
   417  				expectNoExpireSlice()
   418  			})
   419  
   420  			Convey("Skips missing TaskToRun", func() {
   421  				expireBasedOnReservation(
   422  					remoteworkers.ReservationState_RESERVATION_COMPLETED,
   423  					status.Errorf(codes.FailedPrecondition, "no bots"),
   424  					nil,
   425  				)
   426  				expectNoExpireSlice()
   427  			})
   428  		})
   429  	})
   430  }
   431  
   432  type mockedReservationClient struct {
   433  	lastCreate *remoteworkers.CreateReservationRequest
   434  	lastGet    *remoteworkers.GetReservationRequest
   435  	lastCancel *remoteworkers.CancelReservationRequest
   436  
   437  	errCreate error
   438  	errGet    error
   439  	errCancel error
   440  
   441  	newState    remoteworkers.ReservationState
   442  	reservation *remoteworkers.Reservation
   443  }
   444  
   445  func (m *mockedReservationClient) CreateReservation(ctx context.Context, in *remoteworkers.CreateReservationRequest, opts ...grpc.CallOption) (*remoteworkers.Reservation, error) {
   446  	m.lastCreate = in
   447  	m.reservation = proto.Clone(in.Reservation).(*remoteworkers.Reservation)
   448  	m.reservation.State = m.newState
   449  	if m.errCreate != nil {
   450  		return nil, m.errCreate
   451  	}
   452  	return m.reservation, nil
   453  }
   454  
   455  func (m *mockedReservationClient) GetReservation(ctx context.Context, in *remoteworkers.GetReservationRequest, opts ...grpc.CallOption) (*remoteworkers.Reservation, error) {
   456  	m.lastGet = in
   457  	if m.errGet != nil {
   458  		return nil, m.errGet
   459  	}
   460  	return m.reservation, nil
   461  }
   462  
   463  func (m *mockedReservationClient) CancelReservation(ctx context.Context, in *remoteworkers.CancelReservationRequest, opts ...grpc.CallOption) (*remoteworkers.CancelReservationResponse, error) {
   464  	m.lastCancel = in
   465  	if m.errCancel != nil {
   466  		return nil, m.errCancel
   467  	}
   468  	return &remoteworkers.CancelReservationResponse{}, nil
   469  }
   470  
   471  type mockedInternalsClient struct {
   472  	expireSlice func(*internalspb.ExpireSliceRequest) error
   473  }
   474  
   475  func (m *mockedInternalsClient) ExpireSlice(ctx context.Context, in *internalspb.ExpireSliceRequest, opts ...grpc.CallOption) (*emptypb.Empty, error) {
   476  	if m.expireSlice == nil {
   477  		panic("must not be called")
   478  	}
   479  	if err := m.expireSlice(in); err != nil {
   480  		return nil, err
   481  	}
   482  	return &emptypb.Empty{}, nil
   483  }