go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/rpcs/bots_count_bots.go (about)

     1  // Copyright 2024 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package rpcs
    16  
    17  import (
    18  	"context"
    19  
    20  	"golang.org/x/sync/errgroup"
    21  	"google.golang.org/grpc/codes"
    22  	"google.golang.org/grpc/status"
    23  	"google.golang.org/protobuf/types/known/timestamppb"
    24  
    25  	"go.chromium.org/luci/common/clock"
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/common/logging"
    28  	"go.chromium.org/luci/gae/service/datastore"
    29  
    30  	apipb "go.chromium.org/luci/swarming/proto/api_v2"
    31  	"go.chromium.org/luci/swarming/server/acls"
    32  	"go.chromium.org/luci/swarming/server/model"
    33  )
    34  
    35  // CountBots implements the corresponding RPC method.
    36  func (srv *BotsServer) CountBots(ctx context.Context, req *apipb.BotsCountRequest) (*apipb.BotsCount, error) {
    37  	dims, err := model.NewFilter(req.Dimensions)
    38  	if err != nil {
    39  		return nil, status.Errorf(codes.InvalidArgument, "invalid dimensions: %s", err)
    40  	}
    41  
    42  	// If the query is restricted to some set of pools, need the permission in all
    43  	// of them. Otherwise need the global server permissions, since an
    44  	// unrestricted query can return bots from any pool.
    45  	var res acls.CheckResult
    46  	state := State(ctx)
    47  	if pools := dims.Pools(); len(pools) != 0 {
    48  		res = state.ACL.CheckAllPoolsPerm(ctx, pools, acls.PermPoolsListBots)
    49  	} else {
    50  		res = state.ACL.CheckServerPerm(ctx, acls.PermPoolsListBots)
    51  	}
    52  	if !res.Permitted {
    53  		return nil, res.ToGrpcErr()
    54  	}
    55  
    56  	out := &apipb.BotsCount{}
    57  
    58  	// State filter => where to put the final count of bots matching it.
    59  	type perStateQuery struct {
    60  		filter model.StateFilter
    61  		res    *int32
    62  	}
    63  	perState := []perStateQuery{
    64  		{model.StateFilter{}, &out.Count},
    65  		{model.StateFilter{Quarantined: apipb.NullableBool_TRUE}, &out.Quarantined},
    66  		{model.StateFilter{InMaintenance: apipb.NullableBool_TRUE}, &out.Maintenance},
    67  		{model.StateFilter{IsDead: apipb.NullableBool_TRUE}, &out.Dead},
    68  		{model.StateFilter{IsBusy: apipb.NullableBool_TRUE}, &out.Busy},
    69  	}
    70  
    71  	queries := model.FilterBotsByDimensions(model.BotInfoQuery(), srv.BotQuerySplitMode, dims)
    72  
    73  	// If there's only one query, we can upgrade it into an aggregation query and
    74  	// do counting completely on the datastore side. We can't do that though if we
    75  	// need to merge multiple queries with OR operator, since we won't be able to
    76  	// avoid double counting entities that match multiple subqueries at the same
    77  	// time. In that case we will need to manually run multiple keys-only regular
    78  	// queries and merge their results locally (by counting only unique keys).
    79  	//
    80  	// Note that it is tempting to run a single projection query on `composite`
    81  	// repeated field and group bots by state locally (since we need to visit them
    82  	// all anyway to count the total number of bots). But such query surprisingly
    83  	// returns 4x results: one entity per individual value of `composite` field
    84  	// (and not one entity with 4 values in repeated `composite` field as one
    85  	// would expect). So such query is actually ~4x slower in terms of wall clock
    86  	// time than running 5 per-state keys-only queries in parallel (like we do
    87  	// below).
    88  	useAggregation := len(queries) == 1
    89  
    90  	// Manually running many queries and merging their results can take a lot of
    91  	// time (up to a minute in some cases). At least do it transactionally to get
    92  	// a consistent snapshot of the state, since otherwise it can drift quite a
    93  	// bit between individual queries. This appears to have no noticeable impact
    94  	// on performance in production (the queries are equally slow either way).
    95  	//
    96  	// If we are going to use aggregation queries, run them non-transactionally,
    97  	// since they are not supported in transactions. They are usually super fast
    98  	// (under a second, even when counting a lot of bots) and the state doesn't
    99  	// change much during such a short interval. We can tolerate "eventual
   100  	// consistency" for such huge performance wins. Swarming CountBots API never
   101  	// promised to be strongly consistent anyway (and never was before).
   102  	err = maybeTxn(ctx, !useAggregation, func(ctx context.Context) error {
   103  		eg, ectx := errgroup.WithContext(ctx)
   104  		for _, subq := range perState {
   105  			filter, res := subq.filter, subq.res
   106  			eg.Go(func() error {
   107  				var count int64
   108  				var err error
   109  				if useAggregation {
   110  					// Note: len(queries) == 1 here, queries[0] is the only query to run.
   111  					count, err = datastore.Count(ectx, model.FilterBotsByState(queries[0], filter).EventualConsistency(true))
   112  				} else {
   113  					// Apply the filter, enable firestore mode to run in the transaction.
   114  					filtered := make([]*datastore.Query, len(queries))
   115  					for i, q := range queries {
   116  						filtered[i] = model.FilterBotsByState(q, filter).EventualConsistency(false).FirestoreMode(true)
   117  					}
   118  					count, err = datastore.CountMulti(ectx, filtered)
   119  				}
   120  				if err != nil {
   121  					if !errors.Is(err, context.Canceled) {
   122  						logging.Errorf(ctx, "Error in BotInfo query with filter %v: %s", filter, err)
   123  					}
   124  					return err
   125  				}
   126  				*res = int32(count)
   127  				return nil
   128  			})
   129  		}
   130  		return eg.Wait()
   131  	})
   132  	if err != nil {
   133  		return nil, status.Errorf(codes.Internal, "datastore error counting bots")
   134  	}
   135  
   136  	out.Now = timestamppb.New(clock.Now(ctx))
   137  	return out, nil
   138  }
   139  
   140  // maybeTxn runs the callback either transactionally or not, depending on `txn`.
   141  func maybeTxn(ctx context.Context, txn bool, cb func(ctx context.Context) error) error {
   142  	if txn {
   143  		return datastore.RunInTransaction(ctx, cb, &datastore.TransactionOptions{ReadOnly: true})
   144  	}
   145  	return cb(ctx)
   146  }