go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/swarming/proto/internals/rbe.proto (about)

     1  // Copyright 2022 The LUCI Authors. All rights reserved.
     2  // Use of this source code is governed under the Apache License, Version 2.0
     3  // that can be found in the LICENSE file.
     4  
     5  syntax = "proto3";
     6  
     7  package swarming.internals.rbe;
     8  
     9  option go_package = "go.chromium.org/luci/swarming/proto/internals;internalspb";
    10  
    11  import "google/protobuf/duration.proto";
    12  import "google/protobuf/empty.proto";
    13  import "google/protobuf/timestamp.proto";
    14  
    15  import "go.chromium.org/luci/swarming/proto/config/pools.proto";
    16  
    17  // TaggedMessage is an envelope for an HMAC-tagged protobuf message.
    18  //
    19  // The secret key used for tagging should be communicated out of band (e.g.
    20  // stored in some pre-agreed Google Secret Manager secret).
    21  //
    22  // hmac_sha256 is calculated as:
    23  //   hmac_sha256("%d" % payload_type + "\n" + payload, secret_key).
    24  message TaggedMessage {
    25    enum PayloadType {  // identifies which message is serialized in `payload`
    26      PAYLOAD_TYPE_UNSPECIFIED = 0;  // zero value (proto3 default when unset)
    27      POLL_STATE = 1;   // payload is wirepb-serialized PollState
    28      BOT_SESSION = 2;  // payload is wirepb-serialized BotSession
    29    }
    30    PayloadType payload_type = 1;  // identifies the payload proto message type
    31    bytes payload = 2;             // the serialized payload proto message
    32    bytes hmac_sha256 = 3;         // HMAC over payload_type and payload, see above
    33  }
    34  
    35  
    36  // PollState is produced by the Python server and passed to the Go server.
    37  //
    38  // It travels wrapped in a TaggedMessage. Once the HMAC tag is verified, the Go
    39  // server can trust that this message was generated by the Python server and was
    40  // not tampered with.
    41  //
    42  // It is generated by the "/bot/poll" endpoint for bots in the RBE mode. Such
    43  // bots then pass it to the "/bot/rbe/..." endpoints. It contains RBE-related
    44  // parameters for this particular bot produced based on the bot credentials and
    45  // Python server configs.
    46  //
    47  // It also contains instructions on how to authenticate the bot on the Go side.
    48  // They are derived based on how the Python server authenticated this particular
    49  // bot. They are tightly coupled to the swarming.config.BotAuth message.
    50  //
    51  // Once the Go server verifies the HMAC tag, and checks that the bot presented
    52  // the exact same credentials as used by the Python side, the Go side can trust
    53  // that this message was not tampered with and was not "substituted" (i.e.
    54  // taken from one bot and replayed on another).
    55  message PollState {
    56    // Unique ID of this message (primarily to correlate logs that use it).
    57    string id = 1;
    58  
    59    // Expiration time of this message. It should not be used once it expires.
    60    google.protobuf.Timestamp expiry = 2;
    61  
    62    // Full RBE instance name to use.
    63    string rbe_instance = 3;
    64  
    65    // Override these particular dimensions when contacting RBE.
    66    //
    67    // These values will be used instead of whatever the bot is reporting. This is
    68    // used for security-sensitive dimensions like `id` and `pool`.
    69    //
    70    // `id` is always populated and has a single value matching the bot ID.
    71    message Dimension {
    72      string key = 1;              // dimension key, e.g. `id` or `pool`
    73      repeated string values = 2;  // values used instead of the bot-reported ones
    74    }
    75    repeated Dimension enforced_dimensions = 4;
    76  
    77    // Information for logs and debugging (not passed to RBE).
    78    message DebugInfo {
    79      google.protobuf.Timestamp created = 1;  // when this message was created
    80      string swarming_version = 2;            // version of the Swarming code
    81      string request_id = 3;                  // ID of the "/bot/poll" request
    82    }
    83    DebugInfo debug_info = 5;
    84  
    85    //////////////////////////////////////////////////////////////////////////////
    86    // Instructions for the Go server on how to authenticate the bot.
    87  
    88    // If set, use `X-Luci-Gce-Vm-Token` header for auth.
    89    //
    90    // This header should contain JWT with signed VM metadata with the following
    91    // expectations:
    92    //   * Audience matches https://[*-dot-]<app>.appspot.com
    93    //   * google.compute_engine.project_id field matches `gce_project`.
    94    //   * google.compute_engine.instance_name matches `gce_instance`.
    95    message GCEAuth {
    96      string gce_project = 1;   // expected google.compute_engine.project_id
    97      string gce_instance = 2;  // expected google.compute_engine.instance_name
    98    }
    99  
   100    // If set, use `Authorization` header with OAuth2 access tokens for auth.
   101    //
   102    // The token should have "https://www.googleapis.com/auth/userinfo.email"
   103    // scope and belong to the given service account.
   104    message ServiceAccountAuth {
   105      string service_account = 1;  // service account the token must belong to
   106    }
   107  
   108    // If set, use `X-Luci-Machine-Token` header with LUCI machine token.
   109    //
   110    // The token should have the corresponding FQDN in it.
   111    message LUCIMachineTokenAuth {
   112      string machine_fqdn = 1;  // FQDN expected inside the machine token
   113    }
   114  
   115    // If set, use only `ip_allowlist` field for auth (see below).
   116    message IPAllowlistAuth {
   117      // No fields.
   118    }
   119  
   120    // If set, the bot should be in the corresponding IP allowlist (in addition
   121    // to the primary auth check described by auth_method). Always set if
   122    // auth_method is IPAllowlistAuth.
   123    string ip_allowlist = 10;
   124  
   125    // Describes how to authenticate the bot. See swarming.config.BotAuth.
   126    //
   127    // Must be set.
   128    oneof auth_method {
   129      GCEAuth gce_auth = 11;
   130      ServiceAccountAuth service_account_auth = 12;
   131      LUCIMachineTokenAuth luci_machine_token_auth = 13;
   132      IPAllowlistAuth ip_allowlist_auth = 14;
   133    }
   134  }
   135  
   136  
   137  // BotSession carries an RBE bot session ID and the latest validated PollState.
   138  //
   139  // It travels wrapped in a TaggedMessage. It is produced and verified by the
   140  // Go server whenever the bot calls CreateBotSession or UpdateBotSession.
   141  //
   142  // It serves two purposes:
   143  //   1. Protect the RBE bot session ID from tampering by the bot (e.g. prevents
   144  //      the bot from using a different bot session ID of a bot in another pool).
   145  //   2. Preserve parameters of the last known PollState (in particular auth
   146  //      ones) and bind the session ID to them, so even if the bot has a valid
   147  //      BotSession token from another bot, it won't be able to use it (because
   148  //      it will fail the auth check encoded in the PollState).
   149  //
   150  // The original PollState token has limited expiration time and it expires if
   151  // the bot doesn't refresh it by calling the polling endpoint served by the
   152  // Python server. When running a long task, the bot is not polling anything and
   153  // can't refresh the PollState token. But it still periodically calls
   154  // UpdateBotSession to send heartbeats to RBE. This is where PollState stored in
   155  // BotSession token is verified and where BotSession token is occasionally
   156  // refreshed.
   157  //
   158  // If a call to UpdateBotSession has both the PollState token and BotSession
   159  // token (happens when a bot is polling new tasks from RBE), the information in
   160  // the PollState token is used as authoritative since PollState tokens are
   161  // generated by Python code based on the freshest state of bot configs.
   162  // Fields pulled from such PollState token are used to update BotSession token.
   163  message BotSession {
   164    // ID of the RBE's BotSession.
   165    string rbe_bot_session_id = 1;
   166  
   167    // Poll state extracted from the last seen validated PollState token.
   168    //
   169    // Its `expiry` should be ignored in favor of `expiry` field in BotSession.
   170    PollState poll_state = 2;
   171  
   172    // Expiration time of this message. It should not be used once it expires.
   173    google.protobuf.Timestamp expiry = 3;
   174  }
   175  
   176  
   177  // EnqueueRBETask describes the payload of `rbe-enqueue` TQ tasks.
   178  //
   179  // It is submitted into `rbe-enqueue` Cloud Tasks queue by the Python side and
   180  // processed by the Go side (resulting in a new RBE reservation on success).
   181  message EnqueueRBETask {
   182    // Payload of the new RBE reservation. It will eventually be routed to a bot.
   183    TaskPayload payload = 1;
   184  
   185    // Fields below are used to decide how to schedule the reservation. Data in
   186    // them duplicates immutable data already stored in Datastore, but that data
   187    // is potentially hard to get from Go due to the use of
   188    // LocalStructuredProperty, so it is duplicated here.
   189  
   190    // Full RBE instance ID to submit this task to, extracted from TaskRequest.
   191    string rbe_instance = 2;
   192    // When this particular slice expires, extracted from TaskToRunShard.
   193    google.protobuf.Timestamp expiry = 3;
   194    // A bot that should execute this slice (if any), extracted from TaskSlice.
   195    string requested_bot_id = 4;
   196    // Constraints on dimensions reported by a matching bot (ANDed together).
   197    message Constraint {
   198      // The dimension key, e.g. "python_version".
   199      string key = 1;
   200      // Allowed dimension values to satisfy the constraint, e.g. ["3.8", "3.9"].
   201      repeated string allowed_values = 2;
   202    }
   203    repeated Constraint constraints = 5;
   204    // Swarming task priority, as submitted by the client.
   205    int32 priority = 6;
   206    // Swarming scheduling algorithm, as specified in pools.cfg.
   207    swarming.config.Pool.SchedulingAlgorithm scheduling_algorithm = 7;
   208    // How long the task is allowed to run once it starts on the bot.
   209    google.protobuf.Duration execution_timeout = 8;
   210  }
   211  
   212  
   213  // CancelRBETask describes the payload of `rbe-cancel` TQ tasks.
   214  //
   215  // It is submitted into `rbe-cancel` Cloud Tasks queue by the Python side and
   216  // processed by the Go side (resulting in cancellation of an RBE reservation).
   217  message CancelRBETask {
   218    // Full RBE instance ID with the reservation, extracted from TaskRequest.
   219    string rbe_instance = 1;
   220    // Reservation to cancel (scoped to the instance).
   221    string reservation_id = 2;
   222  
   223    // Optional information used for debugging and tracing purposes.
   224    message DebugInfo {
   225      google.protobuf.Timestamp created = 1;  // when this message was created
   226      string py_swarming_version = 2;         // version of Python Swarming
   227      string task_name = 3;                   // user-supplied task name (FYI)
   228    }
   229    DebugInfo debug_info = 3;
   230  }
   231  
   232  
   233  // TaskPayload is used as an RBE task payload.
   234  //
   235  // It is serialized as anypb.Any when passed to RBE, and its full proto name
   236  // is thus sensitive.
   237  //
   238  // It points to an existing TaskToRunShardXXX entity representing the pending
   239  // request to execute a single task slice plus some extra information useful
   240  // for debugging.
   241  //
   242  // It also contains the name of the RBE reservation that will be created to
   243  // represent this task.
   244  message TaskPayload {
   245    // Unique (within the RBE instance) ID of the reservation, for idempotency.
   246    string reservation_id = 1;
   247    // Swarming task ID (aka TaskResultSummary packed id), identifies TaskRequest.
   248    string task_id = 2;
   249    // Task slice index (mostly FYI).
   250    int32 slice_index = 3;
   251    // Shard index of TaskToRunShardXXX entity class.
   252    int32 task_to_run_shard = 4;
   253    // Datastore ID of TaskToRunShardXXX entity (a child of the TaskRequest).
   254    int64 task_to_run_id = 5;
   255  
   256    // Optional information used for debugging and tracing purposes.
   257    message DebugInfo {
   258      google.protobuf.Timestamp created = 1;  // when this message was created
   259      string py_swarming_version = 2;         // version of Python Swarming
   260      string go_swarming_version = 3;         // version of Go Swarming
   261      string task_name = 4;                   // user-supplied task name (FYI)
   262    }
   263    DebugInfo debug_info = 6;
   264  
   265    // If true, the bot should not contact Python Swarming or execute anything,
   266    // and should just immediately move the reservation into COMPLETED state.
   267    //
   268    // This is useful during initial development to test the RBE task
   269    // distribution mechanism in isolation from other Swarming guts.
   270    bool noop = 7;
   271  }
   272  
   273  
   274  // TaskResult is used as an RBE task result.
   275  //
   276  // TaskResult represents an outcome of a reservation that was processed by a bot
   277  // (successfully or not). If a bot never saw the reservation, or crashed midway,
   278  // TaskResult is not available. There's a more generic Reservation.status field
   279  // for these cases in the RBE API.
   280  //
   281  // TaskResult is serialized into anypb.Any when passed to RBE, and its full
   282  // proto name is thus sensitive.
   283  //
   284  // Note that the corresponding TaskPayload is available in the same RBE
   285  // Reservation proto that contains TaskResult, so TaskPayload fields are not
   286  // duplicated in the TaskResult.
   287  message TaskResult {
   288    // Set to a human-readable string if the bot legitimately skipped executing
   289    // the reservation, e.g. because it was already claimed. Used for debugging
   290    // only.
   291    string skip_reason = 1;
   292  
   293    // Set if the bot picked up the reservation, but could not work on it and
   294    // gave up. This usually happens if the bot can't claim the TaskToRun after
   295    // many attempts. This is an internal Swarming error and it results in the
   296    // task failing with BOT_DIED error.
   297    string bot_internal_error = 2;
   298  }
   299  
   300  
   301  // This service is exposed by Python Swarming and called by Go Swarming.
   302  //
   303  // All RPCs are internal to the Swarming backend.
   304  service Internals {
   305    // Marks the slice as expired or failed, switches the task to the next slice.
   306    //
   307    // Does nothing (and succeeds) if the slice is no longer pending or doesn't
   308    // exist.
   309    rpc ExpireSlice(ExpireSliceRequest) returns (google.protobuf.Empty);
   310  }
   311  
   312  
   313  // Body of the ExpireSlice internal RPC call.
   314  //
   315  // It identifies a concrete TaskToRunShardXXX entity and the reason it has
   316  // expired.
   317  message ExpireSliceRequest {
   318    // Swarming task ID (aka TaskResultSummary packed id), identifies TaskRequest.
   319    string task_id = 1;
   320    // Shard index of TaskToRunShardXXX entity class.
   321    int32 task_to_run_shard = 2;
   322    // Datastore ID of TaskToRunShardXXX entity (a child of the TaskRequest).
   323    int64 task_to_run_id = 3;
   324  
   325    // The reason the slice is marked as expired.
   326    enum Reason {
   327      REASON_UNSPECIFIED = 0; // zero value (proto3 default when unset)
   328      NO_RESOURCE = 1;        // no bots alive that match the requested dimensions
   329      PERMISSION_DENIED = 2;  // no access to the RBE instance
   330      INVALID_ARGUMENT = 3;   // RBE didn't like something about the reservation
   331      BOT_INTERNAL_ERROR = 4; // the bot picked up the reservation and then died
   332      EXPIRED = 5;            // the scheduling deadline was exceeded
   333    }
   334    Reason reason = 4;
   335    string details = 5;  // NOTE(review): presumably free-form text on `reason`; confirm
   336  }