// Copyright 2022 The LUCI Authors. All rights reserved.
// Use of this source code is governed under the Apache License, Version 2.0
// that can be found in the LICENSE file.

syntax = "proto3";

package swarming.internals.rbe;

option go_package = "go.chromium.org/luci/swarming/proto/internals;internalspb";

import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";

import "go.chromium.org/luci/swarming/proto/config/pools.proto";

// TaggedMessage is an envelope for an HMAC-tagged protobuf message.
//
// A secret key that was used for tagging should be communicated out of band
// (e.g. stored in some pre-agreed Google Secret Manager secret).
//
// hmac_sha256 is calculated as:
//    hmac_sha256("%d" % payload_type + "\n" + payload, secret_key).
message TaggedMessage {
  enum PayloadType {
    PAYLOAD_TYPE_UNSPECIFIED = 0;
    POLL_STATE = 1;   // payload is wirepb-serialized PollState
    BOT_SESSION = 2;  // payload is wirepb-serialized BotSession
  }
  PayloadType payload_type = 1;  // identifies the payload proto message type
  bytes payload = 2;             // the serialized payload proto message
  bytes hmac_sha256 = 3;         // HMAC of the payload, see the comment above
}


// PollState is produced by the Python server and passed to the Go server.
//
// It travels wrapped in a TaggedMessage. Once the HMAC tag is verified, the Go
// server can trust that this message was generated by the Python server and was
// not tampered with.
//
// It is generated by "/bot/poll" endpoint for bots in the RBE mode. Such bots
// then pass it to the "/bot/rbe/..." endpoints. It contains RBE-related
// parameters for this particular bot produced based on the bot credentials and
// Python server configs.
//
// It also contains instructions on how to authenticate the bot on the Go side.
// They are derived based on how the Python server authenticated this particular
// bot. They are tightly coupled to swarming.config.BotAuth message.
//
// Once the Go server verifies the HMAC tag, and checks that the bot presented
// the exact same credentials as used by the Python side, the Go side can trust
// that this message was not tampered with and was not "substituted" (i.e.
// taken from one bot and replayed on another).
message PollState {
  // Unique ID of this message (primarily to correlate logs that use it).
  string id = 1;

  // Expiration time of this message. It should not be used once it expires.
  google.protobuf.Timestamp expiry = 2;

  // Full RBE instance name to use.
  string rbe_instance = 3;

  // Override these particular dimensions when contacting RBE.
  //
  // These values will be used instead of whatever the bot is reporting. This is
  // used for security-sensitive dimensions like `id` and `pool`.
  //
  // `id` is always populated and has a single value matching the bot ID.
  message Dimension {
    string key = 1;
    repeated string values = 2;
  }
  repeated Dimension enforced_dimensions = 4;

  // Information for logs and debugging (not passed to RBE).
  message DebugInfo {
    google.protobuf.Timestamp created = 1;  // when this message was created
    string swarming_version = 2;            // version of the Swarming code
    string request_id = 3;                  // ID of the "/bot/poll" request
  }
  DebugInfo debug_info = 5;

  //////////////////////////////////////////////////////////////////////////////
  // Instructions for the Go server on how to authenticate the bot.

  // If set, use `X-Luci-Gce-Vm-Token` header for auth.
  //
  // This header should contain JWT with signed VM metadata with the following
  // expectations:
  //    * Audience matches https://[*-dot-]<app>.appspot.com
  //    * google.compute_engine.project_id field matches `gce_project`.
  //    * google.compute_engine.instance_name matches `gce_instance`.
  message GCEAuth {
    string gce_project = 1;
    string gce_instance = 2;
  }

  // If set, use `Authorization` header with OAuth2 access tokens for auth.
  //
  // The token should have "https://www.googleapis.com/auth/userinfo.email"
  // scope and belong to the given service account.
  message ServiceAccountAuth {
    string service_account = 1;
  }

  // If set, use `X-Luci-Machine-Token` header with LUCI machine token.
  //
  // The token should have the corresponding FQDN in it.
  message LUCIMachineTokenAuth {
    string machine_fqdn = 1;
  }

  // If set, use only `ip_allowlist` field for auth (see below).
  message IPAllowlistAuth {
    // No fields.
  }

  // If set, the bot should be in the corresponding IP allowlist (in addition
  // to the primary auth check described by auth_method). Always set if
  // auth_method is IPAllowlistAuth.
  string ip_allowlist = 10;

  // Describes how to authenticate the bot. See swarming.config.BotAuth.
  //
  // Must be set.
  oneof auth_method {
    GCEAuth gce_auth = 11;
    ServiceAccountAuth service_account_auth = 12;
    LUCIMachineTokenAuth luci_machine_token_auth = 13;
    IPAllowlistAuth ip_allowlist_auth = 14;
  }
}


// BotSession carries an RBE bot session ID and the latest validated PollState.
//
// It travels wrapped in a TaggedMessage. It is produced and verified by the
// Go server whenever the bot calls CreateBotSession or UpdateBotSession.
//
// It serves two purposes:
//  1. Protect the RBE bot session ID from tampering by the bot (e.g. prevents
//     the bot from using a different bot session ID of a bot in another pool).
//  2. Preserve parameters of the last known PollState (in particular auth
//     ones) and bind the session ID to them, so even if the bot has a valid
//     BotSession token from another bot, it won't be able to use them (because
//     it will fail the auth check encoded in the PollState).
//
// The original PollState token has limited expiration time and it expires if
// the bot doesn't refresh it by calling the polling endpoint served by the
// Python server. When running a long task, the bot is not polling anything and
// can't refresh the PollState token. But it still periodically calls
// UpdateBotSession to send heartbeats to RBE. This is where PollState stored in
// BotSession token is verified and where BotSession token is occasionally
// refreshed.
//
// If a call to UpdateBotSession has both the PollState token and BotSession
// token (happens when a bot is polling new tasks from RBE), the information in
// the PollState token is used as authoritative since PollState tokens are
// generated by Python code based on the freshest state of bot configs.
// Fields pulled from such PollState token are used to update BotSession token.
message BotSession {
  // ID of the RBE's BotSession.
  string rbe_bot_session_id = 1;

  // Poll state extracted from the last seen validated PollState token.
  //
  // Its `expiry` should be ignored in favor of `expiry` field in BotSession.
  PollState poll_state = 2;

  // Expiration time of this message. It should not be used once it expires.
  google.protobuf.Timestamp expiry = 3;
}


// EnqueueRBETask describes payload of `rbe-enqueue` TQ tasks.
//
// It is submitted into `rbe-enqueue` Cloud Tasks queue by the Python side and
// processed by the Go side (resulting in a new RBE reservation on success).
message EnqueueRBETask {
  // Payload of the new RBE reservation. It will eventually be routed to a bot.
  TaskPayload payload = 1;

  // Fields below are used to decide how to schedule the reservation. Data in
  // them duplicates immutable data already stored in Datastore, but this data
  // is potentially hard to get from Go due to use of LocalStructuredProperty so
  // it is duplicated here.

  // Full RBE instance ID to submit this task to, extracted from TaskRequest.
  string rbe_instance = 2;
  // When this particular slice expires, extracted from TaskToRunShard.
  google.protobuf.Timestamp expiry = 3;
  // A bot that should execute this slice (if any), extracted from TaskSlice.
  string requested_bot_id = 4;
  // Constraints on dimensions reported by a matching bot (ANDed together).
  message Constraint {
    // The dimension key e.g. "python_version".
    string key = 1;
    // Allowed dimension values to satisfy the constraint, e.g. ["3.8", "3.9"].
    repeated string allowed_values = 2;
  }
  repeated Constraint constraints = 5;
  // Swarming task priority, as submitted by the client.
  int32 priority = 6;
  // Swarming scheduling algorithm, as specified in pools.cfg.
  swarming.config.Pool.SchedulingAlgorithm scheduling_algorithm = 7;
  // How long the task is allowed to run once it starts on the bot.
  google.protobuf.Duration execution_timeout = 8;
}


// CancelRBETask describes payload of `rbe-cancel` TQ tasks.
//
// It is submitted into `rbe-cancel` Cloud Tasks queue by the Python side and
// processed by the Go side (resulting in cancellation of an RBE reservation).
message CancelRBETask {
  // Full RBE instance ID with the reservation, extracted from TaskRequest.
  string rbe_instance = 1;
  // Reservation to cancel (scoped to the instance).
  string reservation_id = 2;

  // Optional information used for debugging and tracing purposes.
  message DebugInfo {
    google.protobuf.Timestamp created = 1;  // when this message was created
    string py_swarming_version = 2;         // version of the Python Swarming
    string task_name = 3;                   // the user-supplied task name FYI
  }
  DebugInfo debug_info = 3;
}


// TaskPayload is used as an RBE task payload.
//
// It is serialized as anypb.Any when passed to RBE, and its full proto name
// is thus sensitive.
//
// It points to an existing TaskToRunShardXXX entity representing the pending
// request to execute a single task slice plus some extra information useful
// for debugging.
//
// It also contains the name of the RBE reservation that will be created to
// represent this task.
message TaskPayload {
  // Unique (within the RBE instance) ID of the reservation, for idempotency.
  string reservation_id = 1;
  // Swarming task ID (aka TaskResultSummary packed id), identifies TaskRequest.
  string task_id = 2;
  // Task slice index (mostly FYI).
  int32 slice_index = 3;
  // Shard index of TaskToRunShardXXX entity class.
  int32 task_to_run_shard = 4;
  // Datastore ID of TaskToRunShardXXX entity (a child of the TaskRequest).
  int64 task_to_run_id = 5;

  // Optional information used for debugging and tracing purposes.
  message DebugInfo {
    google.protobuf.Timestamp created = 1;  // when this message was created
    string py_swarming_version = 2;         // version of the Python Swarming
    string go_swarming_version = 3;         // version of the Go Swarming
    string task_name = 4;                   // the user-supplied task name FYI
  }
  DebugInfo debug_info = 6;

  // If true, the bot should not contact Python Swarming or execute anything,
  // and should just immediately move the reservation into COMPLETED state.
  //
  // This is useful during initial development to test RBE task distribution
  // mechanism in isolation from other Swarming guts.
  bool noop = 7;
}


// TaskResult is used as an RBE task result.
//
// TaskResult represents an outcome of a reservation that was processed by a bot
// (successfully or not). If a bot never saw the reservation, or crashed midway,
// TaskResult is not available. There's a more generic Reservation.status field
// for these cases in the RBE API.
//
// TaskResult is serialized into anypb.Any when passed to RBE, and its full
// proto name is thus sensitive.
//
// Note that the corresponding TaskPayload is available in the same RBE
// Reservation proto that contains TaskResult, so TaskPayload fields are not
// duplicated in the TaskResult.
message TaskResult {
  // Set to a human readable string if the bot legitimately skipped executing
  // the reservation e.g. because it was already claimed. Used for debugging
  // only.
  string skip_reason = 1;

  // Set if the bot picked up the reservation, but could not work on it and
  // gave up. This usually happens if the bot can't claim the TaskToRun after
  // many attempts. This is an internal Swarming error and it results in the
  // task failing with BOT_DIED error.
  string bot_internal_error = 2;
}


// This service is exposed by the Python Swarming, called by the Go Swarming.
//
// All RPCs are internal to the Swarming backend.
service Internals {
  // Marks the slice as expired or failed, switches the task to the next slice.
  //
  // Does nothing (and succeeds) if the slice is no longer pending or doesn't
  // exist.
  rpc ExpireSlice(ExpireSliceRequest) returns (google.protobuf.Empty);
}


// Body of ExpireSlice internal RPC call.
//
// It identifies a concrete TaskToRunShardXXX entity and the reason it has
// expired.
message ExpireSliceRequest {
  // Swarming task ID (aka TaskResultSummary packed id), identifies TaskRequest.
  string task_id = 1;
  // Shard index of TaskToRunShardXXX entity class.
  int32 task_to_run_shard = 2;
  // Datastore ID of TaskToRunShardXXX entity (a child of the TaskRequest).
  int64 task_to_run_id = 3;

  // The reason the slice is marked as expired.
  enum Reason {
    REASON_UNSPECIFIED = 0;
    NO_RESOURCE = 1;         // no bots alive that match the requested dimensions
    PERMISSION_DENIED = 2;   // no access to the RBE instance
    INVALID_ARGUMENT = 3;    // RBE didn't like something about the reservation
    BOT_INTERNAL_ERROR = 4;  // the bot picked up the reservation and then died
    EXPIRED = 5;             // the scheduling deadline exceeded
  }
  // Why the slice is being marked as expired, see Reason.
  Reason reason = 4;
  // Extra details accompanying `reason`.
  // NOTE(review): presumably free-form text for logs and debugging; confirm
  // against the callers of ExpireSlice.
  string details = 5;
}