go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/server/rpcs/bots_count_bots.go (about) 1 // Copyright 2024 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package rpcs 16 17 import ( 18 "context" 19 20 "golang.org/x/sync/errgroup" 21 "google.golang.org/grpc/codes" 22 "google.golang.org/grpc/status" 23 "google.golang.org/protobuf/types/known/timestamppb" 24 25 "go.chromium.org/luci/common/clock" 26 "go.chromium.org/luci/common/errors" 27 "go.chromium.org/luci/common/logging" 28 "go.chromium.org/luci/gae/service/datastore" 29 30 apipb "go.chromium.org/luci/swarming/proto/api_v2" 31 "go.chromium.org/luci/swarming/server/acls" 32 "go.chromium.org/luci/swarming/server/model" 33 ) 34 35 // CountBots implements the corresponding RPC method. 36 func (srv *BotsServer) CountBots(ctx context.Context, req *apipb.BotsCountRequest) (*apipb.BotsCount, error) { 37 dims, err := model.NewFilter(req.Dimensions) 38 if err != nil { 39 return nil, status.Errorf(codes.InvalidArgument, "invalid dimensions: %s", err) 40 } 41 42 // If the query is restricted to some set of pools, need the permission in all 43 // of them. Otherwise need the global server permissions, since an 44 // unrestricted query can return bots from any pool. 45 var res acls.CheckResult 46 state := State(ctx) 47 if pools := dims.Pools(); len(pools) != 0 { 48 res = state.ACL.CheckAllPoolsPerm(ctx, pools, acls.PermPoolsListBots) 49 } else { 50 res = state.ACL.CheckServerPerm(ctx, acls.PermPoolsListBots) 51 } 52 if !res.Permitted { 53 return nil, res.ToGrpcErr() 54 } 55 56 out := &apipb.BotsCount{} 57 58 // State filter => where to put the final count of bots matching it. 59 type perStateQuery struct { 60 filter model.StateFilter 61 res *int32 62 } 63 perState := []perStateQuery{ 64 {model.StateFilter{}, &out.Count}, 65 {model.StateFilter{Quarantined: apipb.NullableBool_TRUE}, &out.Quarantined}, 66 {model.StateFilter{InMaintenance: apipb.NullableBool_TRUE}, &out.Maintenance}, 67 {model.StateFilter{IsDead: apipb.NullableBool_TRUE}, &out.Dead}, 68 {model.StateFilter{IsBusy: apipb.NullableBool_TRUE}, &out.Busy}, 69 } 70 71 queries := model.FilterBotsByDimensions(model.BotInfoQuery(), srv.BotQuerySplitMode, dims) 72 73 // If there's only one query, we can upgrade it into an aggregation query and 74 // do counting completely on the datastore side. We can't do that though if we 75 // need to merge multiple queries with OR operator, since we won't be able to 76 // avoid double counting entities that match multiple subqueries at the same 77 // time. In that case we will need to manually run multiple keys-only regular 78 // queries and merge their results locally (by counting only unique keys). 79 // 80 // Note that it is tempting to run a single projection query on `composite` 81 // repeated field and group bots by state locally (since we need to visit them 82 // all anyway to count the total number of bots). But such query surprisingly 83 // returns 4x results: one entity per individual value of `composite` field 84 // (and not one entity with 4 values in repeated `composite` field as one 85 // would expect). So such query is actually ~4x slower in terms of wall clock 86 // time than running 5 per-state keys-only queries in parallel (like we do 87 // below). 88 useAggregation := len(queries) == 1 89 90 // Manually running many queries and merging their results can take a lot of 91 // time (up to a minute in some cases). At least do it transactionally to get 92 // a consistent snapshot of the state, since otherwise it can drift quite a 93 // bit between individual queries. This appears to have no noticeable impact 94 // on performance in production (the queries are equally slow either way). 95 // 96 // If we are going to use aggregation queries, run them non-transactionally, 97 // since they are not supported in transactions. They are usually super fast 98 // (under a second, even when counting a lot of bots) and the state doesn't 99 // change much during such a short interval. We can tolerate "eventual 100 // consistency" for such huge performance wins. Swarming CountBots API never 101 // promised to be strongly consistent anyway (and never was before). 102 err = maybeTxn(ctx, !useAggregation, func(ctx context.Context) error { 103 eg, ectx := errgroup.WithContext(ctx) 104 for _, subq := range perState { 105 filter, res := subq.filter, subq.res 106 eg.Go(func() error { 107 var count int64 108 var err error 109 if useAggregation { 110 // Note: len(queries) == 1 here, queries[0] is the only query to run. 111 count, err = datastore.Count(ectx, model.FilterBotsByState(queries[0], filter).EventualConsistency(true)) 112 } else { 113 // Apply the filter, enable firestore mode to run in the transaction. 114 filtered := make([]*datastore.Query, len(queries)) 115 for i, q := range queries { 116 filtered[i] = model.FilterBotsByState(q, filter).EventualConsistency(false).FirestoreMode(true) 117 } 118 count, err = datastore.CountMulti(ectx, filtered) 119 } 120 if err != nil { 121 if !errors.Is(err, context.Canceled) { 122 logging.Errorf(ctx, "Error in BotInfo query with filter %v: %s", filter, err) 123 } 124 return err 125 } 126 *res = int32(count) 127 return nil 128 }) 129 } 130 return eg.Wait() 131 }) 132 if err != nil { 133 return nil, status.Errorf(codes.Internal, "datastore error counting bots") 134 } 135 136 out.Now = timestamppb.New(clock.Now(ctx)) 137 return out, nil 138 } 139 140 // maybeTxn runs the callback either transactionally or not, depending on `txn`. 141 func maybeTxn(ctx context.Context, txn bool, cb func(ctx context.Context) error) error { 142 if txn { 143 return datastore.RunInTransaction(ctx, cb, &datastore.TransactionOptions{ReadOnly: true}) 144 } 145 return cb(ctx) 146 }