github.com/hernad/nomad@v1.6.112/nomad/client_alloc_endpoint.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package nomad 5 6 import ( 7 "errors" 8 "fmt" 9 "io" 10 "net" 11 "time" 12 13 "github.com/armon/go-metrics" 14 "github.com/hashicorp/go-hclog" 15 "github.com/hashicorp/go-msgpack/codec" 16 17 "github.com/hernad/nomad/acl" 18 cstructs "github.com/hernad/nomad/client/structs" 19 "github.com/hernad/nomad/helper/pointer" 20 "github.com/hernad/nomad/nomad/structs" 21 ) 22 23 // ClientAllocations is used to forward RPC requests to the targeted Nomad client's 24 // Allocation endpoint. 25 type ClientAllocations struct { 26 srv *Server 27 logger hclog.Logger 28 } 29 30 func NewClientAllocationsEndpoint(srv *Server) *ClientAllocations { 31 return &ClientAllocations{srv: srv, logger: srv.logger.Named("client_allocs")} 32 } 33 34 func (a *ClientAllocations) register() { 35 a.srv.streamingRpcs.Register("Allocations.Exec", a.exec) 36 } 37 38 // GarbageCollectAll is used to garbage collect all allocations on a client. 39 func (a *ClientAllocations) GarbageCollectAll(args *structs.NodeSpecificRequest, reply *structs.GenericResponse) error { 40 // We only allow stale reads since the only potentially stale information is 41 // the Node registration and the cost is fairly high for adding another hop 42 // in the forwarding chain. 43 args.QueryOptions.AllowStale = true 44 45 authErr := a.srv.Authenticate(nil, args) 46 47 // Potentially forward to a different region. 48 if done, err := a.srv.forward("ClientAllocations.GarbageCollectAll", args, args, reply); done { 49 return err 50 } 51 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricWrite, args) 52 if authErr != nil { 53 return structs.ErrPermissionDenied 54 } 55 defer metrics.MeasureSince([]string{"nomad", "client_allocations", "garbage_collect_all"}, time.Now()) 56 57 // Check node read permissions 58 if aclObj, err := a.srv.ResolveACL(args); err != nil { 59 return err 60 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 61 return structs.ErrPermissionDenied 62 } 63 64 // Verify the arguments. 65 if args.NodeID == "" { 66 return errors.New("missing NodeID") 67 } 68 69 // Make sure Node is valid and new enough to support RPC 70 snap, err := a.srv.State().Snapshot() 71 if err != nil { 72 return err 73 } 74 75 _, err = getNodeForRpc(snap, args.NodeID) 76 if err != nil { 77 return err 78 } 79 80 // Get the connection to the client 81 state, ok := a.srv.getNodeConn(args.NodeID) 82 if !ok { 83 return findNodeConnAndForward(a.srv, args.NodeID, "ClientAllocations.GarbageCollectAll", args, reply) 84 } 85 86 // Make the RPC 87 return NodeRpc(state.Session, "Allocations.GarbageCollectAll", args, reply) 88 } 89 90 // Signal is used to send a signal to an allocation on a client. 91 func (a *ClientAllocations) Signal(args *structs.AllocSignalRequest, reply *structs.GenericResponse) error { 92 // We only allow stale reads since the only potentially stale information is 93 // the Node registration and the cost is fairly high for adding another hope 94 // in the forwarding chain. 95 args.QueryOptions.AllowStale = true 96 97 authErr := a.srv.Authenticate(nil, args) 98 99 // Potentially forward to a different region. 100 if done, err := a.srv.forward("ClientAllocations.Signal", args, args, reply); done { 101 return err 102 } 103 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricWrite, args) 104 if authErr != nil { 105 return structs.ErrPermissionDenied 106 } 107 defer metrics.MeasureSince([]string{"nomad", "client_allocations", "signal"}, time.Now()) 108 109 // Verify the arguments. 110 if args.AllocID == "" { 111 return errors.New("missing AllocID") 112 } 113 114 // Find the allocation 115 snap, err := a.srv.State().Snapshot() 116 if err != nil { 117 return err 118 } 119 120 alloc, err := getAlloc(snap, args.AllocID) 121 if err != nil { 122 return err 123 } 124 125 // Check namespace alloc-lifecycle permission. 126 if aclObj, err := a.srv.ResolveACL(args); err != nil { 127 return err 128 } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityAllocLifecycle) { 129 return structs.ErrPermissionDenied 130 } 131 132 // Make sure Node is valid and new enough to support RPC 133 _, err = getNodeForRpc(snap, alloc.NodeID) 134 if err != nil { 135 return err 136 } 137 138 // Get the connection to the client 139 state, ok := a.srv.getNodeConn(alloc.NodeID) 140 if !ok { 141 return findNodeConnAndForward(a.srv, alloc.NodeID, "ClientAllocations.Signal", args, reply) 142 } 143 144 // Make the RPC 145 return NodeRpc(state.Session, "Allocations.Signal", args, reply) 146 } 147 148 // GarbageCollect is used to garbage collect an allocation on a client. 149 func (a *ClientAllocations) GarbageCollect(args *structs.AllocSpecificRequest, reply *structs.GenericResponse) error { 150 // We only allow stale reads since the only potentially stale information is 151 // the Node registration and the cost is fairly high for adding another hop 152 // in the forwarding chain. 153 args.QueryOptions.AllowStale = true 154 155 authErr := a.srv.Authenticate(nil, args) 156 157 // Potentially forward to a different region. 158 if done, err := a.srv.forward("ClientAllocations.GarbageCollect", args, args, reply); done { 159 return err 160 } 161 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricWrite, args) 162 if authErr != nil { 163 return structs.ErrPermissionDenied 164 } 165 defer metrics.MeasureSince([]string{"nomad", "client_allocations", "garbage_collect"}, time.Now()) 166 167 // Verify the arguments. 168 if args.AllocID == "" { 169 return errors.New("missing AllocID") 170 } 171 172 // Find the allocation 173 snap, err := a.srv.State().Snapshot() 174 if err != nil { 175 return err 176 } 177 178 alloc, err := getAlloc(snap, args.AllocID) 179 if err != nil { 180 return err 181 } 182 183 // Check namespace submit-job permission. 184 if aclObj, err := a.srv.ResolveACL(args); err != nil { 185 return err 186 } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilitySubmitJob) { 187 return structs.ErrPermissionDenied 188 } 189 190 // Make sure Node is valid and new enough to support RPC 191 _, err = getNodeForRpc(snap, alloc.NodeID) 192 if err != nil { 193 return err 194 } 195 196 // Get the connection to the client 197 state, ok := a.srv.getNodeConn(alloc.NodeID) 198 if !ok { 199 return findNodeConnAndForward(a.srv, alloc.NodeID, "ClientAllocations.GarbageCollect", args, reply) 200 } 201 202 // Make the RPC 203 return NodeRpc(state.Session, "Allocations.GarbageCollect", args, reply) 204 } 205 206 // Restart is used to trigger a restart of an allocation or a subtask on a client. 207 func (a *ClientAllocations) Restart(args *structs.AllocRestartRequest, reply *structs.GenericResponse) error { 208 // We only allow stale reads since the only potentially stale information is 209 // the Node registration and the cost is fairly high for adding another hop 210 // in the forwarding chain. 211 args.QueryOptions.AllowStale = true 212 213 authErr := a.srv.Authenticate(nil, args) 214 215 // Potentially forward to a different region. 216 if done, err := a.srv.forward("ClientAllocations.Restart", args, args, reply); done { 217 return err 218 } 219 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricWrite, args) 220 if authErr != nil { 221 return structs.ErrPermissionDenied 222 } 223 defer metrics.MeasureSince([]string{"nomad", "client_allocations", "restart"}, time.Now()) 224 225 // Find the allocation 226 snap, err := a.srv.State().Snapshot() 227 if err != nil { 228 return err 229 } 230 231 alloc, err := getAlloc(snap, args.AllocID) 232 if err != nil { 233 return err 234 } 235 236 // Check for namespace alloc-lifecycle permissions. 237 if aclObj, err := a.srv.ResolveACL(args); err != nil { 238 return err 239 } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityAllocLifecycle) { 240 return structs.ErrPermissionDenied 241 } 242 243 // Make sure Node is valid and new enough to support RPC 244 _, err = getNodeForRpc(snap, alloc.NodeID) 245 if err != nil { 246 return err 247 } 248 249 // Get the connection to the client 250 state, ok := a.srv.getNodeConn(alloc.NodeID) 251 if !ok { 252 return findNodeConnAndForward(a.srv, alloc.NodeID, "ClientAllocations.Restart", args, reply) 253 } 254 255 // Make the RPC 256 return NodeRpc(state.Session, "Allocations.Restart", args, reply) 257 } 258 259 // Stats is used to collect allocation statistics 260 func (a *ClientAllocations) Stats(args *cstructs.AllocStatsRequest, reply *cstructs.AllocStatsResponse) error { 261 // We only allow stale reads since the only potentially stale information is 262 // the Node registration and the cost is fairly high for adding another hop 263 // in the forwarding chain. 264 args.QueryOptions.AllowStale = true 265 266 authErr := a.srv.Authenticate(nil, args) 267 268 // Potentially forward to a different region. 269 if done, err := a.srv.forward("ClientAllocations.Stats", args, args, reply); done { 270 return err 271 } 272 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricRead, args) 273 if authErr != nil { 274 return structs.ErrPermissionDenied 275 } 276 defer metrics.MeasureSince([]string{"nomad", "client_allocations", "stats"}, time.Now()) 277 278 // Find the allocation 279 snap, err := a.srv.State().Snapshot() 280 if err != nil { 281 return err 282 } 283 284 alloc, err := getAlloc(snap, args.AllocID) 285 if err != nil { 286 return err 287 } 288 289 // Check for namespace read-job permissions. 290 if aclObj, err := a.srv.ResolveACL(args); err != nil { 291 return err 292 } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityReadJob) { 293 return structs.ErrPermissionDenied 294 } 295 296 // Make sure Node is valid and new enough to support RPC 297 _, err = getNodeForRpc(snap, alloc.NodeID) 298 if err != nil { 299 return err 300 } 301 302 // Get the connection to the client 303 state, ok := a.srv.getNodeConn(alloc.NodeID) 304 if !ok { 305 return findNodeConnAndForward(a.srv, alloc.NodeID, "ClientAllocations.Stats", args, reply) 306 } 307 308 // Make the RPC 309 return NodeRpc(state.Session, "Allocations.Stats", args, reply) 310 } 311 312 // Checks is the server implementation of the allocation checks RPC. The 313 // ultimate response is provided by the node running the allocation. This RPC 314 // is needed to handle queries which hit the server agent API directly, or via 315 // another node which is not running the allocation. 316 func (a *ClientAllocations) Checks(args *cstructs.AllocChecksRequest, reply *cstructs.AllocChecksResponse) error { 317 318 // We only allow stale reads since the only potentially stale information 319 // is the Node registration and the cost is fairly high for adding another 320 // hop in the forwarding chain. 321 args.QueryOptions.AllowStale = true 322 323 authErr := a.srv.Authenticate(nil, args) 324 325 // Potentially forward to a different region. 326 if done, err := a.srv.forward("ClientAllocations.Checks", args, args, reply); done { 327 return err 328 } 329 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricRead, args) 330 if authErr != nil { 331 return structs.ErrPermissionDenied 332 } 333 defer metrics.MeasureSince([]string{"nomad", "client_allocations", "checks"}, time.Now()) 334 335 // Grab the state snapshot, as we need this to perform lookups for a number 336 // of objects, all things being well. 337 snap, err := a.srv.State().Snapshot() 338 if err != nil { 339 return err 340 } 341 342 // Get the full allocation object, so we have information such as the 343 // namespace and node ID. 344 alloc, err := getAlloc(snap, args.AllocID) 345 if err != nil { 346 return err 347 } 348 349 // Check for namespace read-job permissions. 350 if aclObj, err := a.srv.ResolveACL(args); err != nil { 351 return err 352 } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityReadJob) { 353 return structs.ErrPermissionDenied 354 } 355 356 // Make sure Node is valid and new enough to support RPC. 357 if _, err = getNodeForRpc(snap, alloc.NodeID); err != nil { 358 return err 359 } 360 361 // Get the connection to the client. 362 state, ok := a.srv.getNodeConn(alloc.NodeID) 363 if !ok { 364 return findNodeConnAndForward(a.srv, alloc.NodeID, "ClientAllocations.Checks", args, reply) 365 } 366 367 // Make the RPC 368 return NodeRpc(state.Session, "Allocations.Checks", args, reply) 369 } 370 371 // exec is used to execute command in a running task 372 func (a *ClientAllocations) exec(conn io.ReadWriteCloser) { 373 defer conn.Close() 374 defer metrics.MeasureSince([]string{"nomad", "alloc", "exec"}, time.Now()) 375 376 // Decode the arguments 377 var args cstructs.AllocExecRequest 378 decoder := codec.NewDecoder(conn, structs.MsgpackHandle) 379 encoder := codec.NewEncoder(conn, structs.MsgpackHandle) 380 381 if err := decoder.Decode(&args); err != nil { 382 handleStreamResultError(err, pointer.Of(int64(500)), encoder) 383 return 384 } 385 386 authErr := a.srv.Authenticate(nil, &args) 387 388 // Check if we need to forward to a different region 389 if r := args.RequestRegion(); r != a.srv.Region() { 390 forwardRegionStreamingRpc(a.srv, conn, encoder, &args, "Allocations.Exec", 391 args.AllocID, &args.QueryOptions) 392 return 393 } 394 a.srv.MeasureRPCRate("client_allocations", structs.RateMetricWrite, &args) 395 if authErr != nil { 396 handleStreamResultError(structs.ErrPermissionDenied, nil, encoder) 397 return 398 } 399 400 // Verify the arguments. 401 if args.AllocID == "" { 402 handleStreamResultError(errors.New("missing AllocID"), pointer.Of(int64(400)), encoder) 403 return 404 } 405 406 // Retrieve the allocation 407 snap, err := a.srv.State().Snapshot() 408 if err != nil { 409 handleStreamResultError(err, nil, encoder) 410 return 411 } 412 413 alloc, err := getAlloc(snap, args.AllocID) 414 if structs.IsErrUnknownAllocation(err) { 415 handleStreamResultError(err, pointer.Of(int64(404)), encoder) 416 return 417 } 418 if err != nil { 419 handleStreamResultError(err, nil, encoder) 420 return 421 } 422 423 // Check node read permissions 424 if aclObj, err := a.srv.ResolveACL(&args); err != nil { 425 handleStreamResultError(err, nil, encoder) 426 return 427 } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityAllocExec) { 428 // client ultimately checks if AllocNodeExec is required 429 handleStreamResultError(structs.ErrPermissionDenied, nil, encoder) 430 return 431 } 432 433 nodeID := alloc.NodeID 434 435 // Make sure Node is valid and new enough to support RPC 436 node, err := snap.NodeByID(nil, nodeID) 437 if err != nil { 438 handleStreamResultError(err, pointer.Of(int64(500)), encoder) 439 return 440 } 441 442 if node == nil { 443 err := fmt.Errorf("Unknown node %q", nodeID) 444 handleStreamResultError(err, pointer.Of(int64(400)), encoder) 445 return 446 } 447 448 if err := nodeSupportsRpc(node); err != nil { 449 handleStreamResultError(err, pointer.Of(int64(400)), encoder) 450 return 451 } 452 453 // Get the connection to the client either by forwarding to another server 454 // or creating a direct stream 455 var clientConn net.Conn 456 state, ok := a.srv.getNodeConn(nodeID) 457 if !ok { 458 // Determine the Server that has a connection to the node. 459 srv, err := a.srv.serverWithNodeConn(nodeID, a.srv.Region()) 460 if err != nil { 461 var code *int64 462 if structs.IsErrNoNodeConn(err) { 463 code = pointer.Of(int64(404)) 464 } 465 handleStreamResultError(err, code, encoder) 466 return 467 } 468 469 // Get a connection to the server 470 conn, err := a.srv.streamingRpc(srv, "Allocations.Exec") 471 if err != nil { 472 handleStreamResultError(err, nil, encoder) 473 return 474 } 475 476 clientConn = conn 477 } else { 478 stream, err := NodeStreamingRpc(state.Session, "Allocations.Exec") 479 if err != nil { 480 handleStreamResultError(err, nil, encoder) 481 return 482 } 483 clientConn = stream 484 } 485 defer clientConn.Close() 486 487 // Send the request. 488 outEncoder := codec.NewEncoder(clientConn, structs.MsgpackHandle) 489 if err := outEncoder.Encode(args); err != nil { 490 handleStreamResultError(err, nil, encoder) 491 return 492 } 493 494 structs.Bridge(conn, clientConn) 495 }