github.com/criteo-forks/consul@v1.4.5-criteonogrpc/agent/consul/rpc.go

package consul

import (
	"crypto/tls"
	"fmt"
	"io"
	"net"
	"strings"
	"sync/atomic"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/consul/agent/cache"
	"github.com/hashicorp/consul/agent/consul/state"
	"github.com/hashicorp/consul/agent/metadata"
	"github.com/hashicorp/consul/agent/pool"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/lib"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/memberlist"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/hashicorp/yamux"
)

const (
	// maxQueryTime is used to bound the limit of a blocking query
	maxQueryTime = 600 * time.Second

	// defaultQueryTime is the amount of time we block waiting for a change
	// if no time is specified. Previously we would wait the maxQueryTime.
	defaultQueryTime = 300 * time.Second

	// jitterFraction is the limit to the amount of jitter we apply
	// to a user specified MaxQueryTime. We divide the specified time by
	// the fraction. So 16 == 6.25% limit of jitter (e.g. a 600s
	// MaxQueryTime gets up to 37.5s of jitter). This same fraction
	// is applied to the RPCHoldTimeout
	jitterFraction = 16

	// Warn if the Raft command is larger than this.
	// If it's over 1MB something is probably being abusive.
	raftWarnSize = 1024 * 1024

	// enqueueLimit caps how long we will wait to enqueue
	// a new Raft command. Something is probably wrong if this
	// value is ever reached. However, it prevents us from blocking
	// the requesting goroutine forever.
	enqueueLimit = 30 * time.Second
)

// listen is used to listen for incoming RPC connections
func (s *Server) listen(listener net.Listener) {
	for {
		// Accept a connection
		conn, err := listener.Accept()
		if err != nil {
			if s.shutdown {
				return
			}
			s.logger.Printf("[ERR] consul.rpc: failed to accept RPC conn: %v", err)
			continue
		}

		go s.handleConn(conn, false)
		metrics.IncrCounter([]string{"rpc", "accept_conn"}, 1)
	}
}

// logConn is a wrapper around memberlist's LogConn so that we format references
// to "from" addresses in a consistent way. This is just a shorter name.
func logConn(conn net.Conn) string {
	return memberlist.LogConn(conn)
}

// handleConn is used to determine if this is a Raft or
// Consul type RPC connection and invoke the correct handler
func (s *Server) handleConn(conn net.Conn, isTLS bool) {
	// Read a single byte
	buf := make([]byte, 1)
	if _, err := conn.Read(buf); err != nil {
		if err != io.EOF {
			s.logger.Printf("[ERR] consul.rpc: failed to read byte: %v %s", err, logConn(conn))
		}
		conn.Close()
		return
	}
	typ := pool.RPCType(buf[0])

	// Enforce TLS if VerifyIncoming is set
	if s.config.VerifyIncoming && !isTLS && typ != pool.RPCTLS {
		s.logger.Printf("[WARN] consul.rpc: Non-TLS connection attempted with VerifyIncoming set %s", logConn(conn))
		conn.Close()
		return
	}

	// Switch on the byte
	switch typ {
	case pool.RPCConsul:
		s.handleConsulConn(conn)

	case pool.RPCRaft:
		metrics.IncrCounter([]string{"rpc", "raft_handoff"}, 1)
		s.raftLayer.Handoff(conn)

	case pool.RPCTLS:
		if s.rpcTLS == nil {
			s.logger.Printf("[WARN] consul.rpc: TLS connection attempted, server not configured for TLS %s", logConn(conn))
			conn.Close()
			return
		}
		conn = tls.Server(conn, s.rpcTLS)
		s.handleConn(conn, true)

	case pool.RPCMultiplexV2:
		s.handleMultiplexV2(conn)

	case pool.RPCSnapshot:
		s.handleSnapshotConn(conn)

	default:
		if !s.handleEnterpriseRPCConn(typ, conn, isTLS) {
			s.logger.Printf("[ERR] consul.rpc: unrecognized RPC byte: %v %s", typ, logConn(conn))
			conn.Close()
		}
	}
}

// handleMultiplexV2 is used to multiplex a single incoming connection
// using the Yamux multiplexer
func (s *Server) handleMultiplexV2(conn net.Conn) {
	defer conn.Close()
	conf := yamux.DefaultConfig()
	conf.LogOutput = s.config.LogOutput
	server, _ := yamux.Server(conn, conf)
	for {
		sub, err := server.Accept()
		if err != nil {
			if err != io.EOF {
				s.logger.Printf("[ERR] consul.rpc: multiplex conn accept failed: %v %s", err, logConn(conn))
			}
			return
		}
		go s.handleConsulConn(sub)
	}
}

// handleConsulConn is used to service a single Consul RPC connection
func (s *Server) handleConsulConn(conn net.Conn) {
	defer conn.Close()
	rpcCodec := msgpackrpc.NewServerCodec(conn)
	for {
		select {
		case <-s.shutdownCh:
			return
		default:
		}

		if err := s.rpcServer.ServeRequest(rpcCodec); err != nil {
			if err != io.EOF && !strings.Contains(err.Error(), "closed") {
				s.logger.Printf("[ERR] consul.rpc: RPC error: %v %s", err, logConn(conn))
				metrics.IncrCounter([]string{"rpc", "request_error"}, 1)
			}
			return
		}
		metrics.IncrCounter([]string{"rpc", "request"}, 1)
	}
}

// handleSnapshotConn is used to dispatch snapshot saves and restores, which
// stream so they don't use the normal RPC mechanism.
func (s *Server) handleSnapshotConn(conn net.Conn) {
	go func() {
		defer conn.Close()
		if err := s.handleSnapshotRequest(conn); err != nil {
			s.logger.Printf("[ERR] consul.rpc: Snapshot RPC error: %v %s", err, logConn(conn))
		}
	}()
}

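// Illustrative sketch, not part of the original file: how a caller speaks to
// this listener. The first byte on the wire is the RPC type that handleConn
// switches on; for RPCConsul, the rest of the connection is net/rpc with the
// msgpack codec, served by handleConsulConn above. This roughly mirrors how
// Consul's own tests exercise the path; the function name and hard-coded
// address are hypothetical.
func exampleDialConsulRPC() error {
	// Assumed address of a Consul server's RPC port.
	conn, err := net.DialTimeout("tcp", "127.0.0.1:8300", time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	// The single type byte tells handleConn which protocol follows.
	if _, err := conn.Write([]byte{byte(pool.RPCConsul)}); err != nil {
		return err
	}

	// Everything after the type byte is msgpack-rpc.
	codec := msgpackrpc.NewClientCodec(conn)
	var out struct{}
	return msgpackrpc.CallWithCodec(codec, "Status.Ping", struct{}{}, &out)
}
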
// canRetry returns true if the given situation is safe for a retry.
func canRetry(args interface{}, err error) bool {
	// No leader errors are always safe to retry since no state could have
	// been changed.
	if structs.IsErrNoLeader(err) {
		return true
	}

	// Reads are safe to retry for stream errors, such as if a server was
	// being shut down.
	info, ok := args.(structs.RPCInfo)
	if ok && info.IsRead() && lib.IsErrEOF(err) {
		return true
	}

	return false
}

// forward is used to forward to a remote DC or to forward to the local leader
// Returns a bool of whether forwarding was performed, as well as any error
func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) {
	var firstCheck time.Time

	// Handle DC forwarding
	dc := info.RequestDatacenter()
	if dc != s.config.Datacenter {
		err := s.forwardDC(method, dc, args, reply)
		return true, err
	}

	// Check if we can allow a stale read, ensure our local DB is initialized
	if info.IsRead() && info.AllowStaleRead() && !s.raft.LastContact().IsZero() {
		return false, nil
	}

CHECK_LEADER:
	// Fail fast if we are in the process of leaving
	select {
	case <-s.leaveCh:
		return true, structs.ErrNoLeader
	default:
	}

	// Find the leader
	isLeader, leader := s.getLeader()

	// Handle the case we are the leader
	if isLeader {
		return false, nil
	}

	// Handle the case of a known leader
	rpcErr := structs.ErrNoLeader
	if leader != nil {
		rpcErr = s.connPool.RPC(s.config.Datacenter, leader.Addr,
			leader.Version, method, leader.UseTLS, args, reply)
		if rpcErr != nil && canRetry(info, rpcErr) {
			goto RETRY
		}
		return true, rpcErr
	}

RETRY:
	// Gate the request until there is a leader
	if firstCheck.IsZero() {
		firstCheck = time.Now()
	}
	if time.Since(firstCheck) < s.config.RPCHoldTimeout {
		jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
		select {
		case <-time.After(jitter):
			goto CHECK_LEADER
		case <-s.leaveCh:
		case <-s.shutdownCh:
		}
	}

	// No leader found and hold time exceeded
	return true, rpcErr
}

// getLeader returns if the current node is the leader, and if not then it
// returns the leader which is potentially nil if the cluster has not yet
// elected a leader.
func (s *Server) getLeader() (bool, *metadata.Server) {
	// Check if we are the leader
	if s.IsLeader() {
		return true, nil
	}

	// Get the leader
	leader := s.raft.Leader()
	if leader == "" {
		return false, nil
	}

	// Lookup the server
	server := s.serverLookup.Server(leader)

	// Server could be nil
	return false, server
}

// forwardDC is used to forward an RPC call to a remote DC, or fail if no servers
func (s *Server) forwardDC(method, dc string, args interface{}, reply interface{}) error {
	manager, server, ok := s.router.FindRoute(dc)
	if !ok {
		s.logger.Printf("[WARN] consul.rpc: RPC request for DC %q, no path found", dc)
		return structs.ErrNoDCPath
	}

	metrics.IncrCounterWithLabels([]string{"rpc", "cross-dc"}, 1,
		[]metrics.Label{{Name: "datacenter", Value: dc}})
	if err := s.connPool.RPC(dc, server.Addr, server.Version, method, server.UseTLS, args, reply); err != nil {
		manager.NotifyFailedServer(server)
		s.logger.Printf("[ERR] consul: RPC failed to server %s in DC %q: %v", server.Addr, dc, err)
		return err
	}

	return nil
}

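// Illustrative sketch, not part of the original file: the typical write path
// an RPC endpoint builds on top of forward and raftApply (raftApply is defined
// just below). The method name is hypothetical; upstream endpoints such as
// KVS.Apply follow this shape: bail out if the request was forwarded,
// otherwise apply the change through Raft and unwrap the FSM response.
func (s *Server) exampleApplyKV(args *structs.KVSRequest, reply *bool) error {
	// If we are not the leader, or the request targets another DC, forward
	// handles it and we simply return its result.
	if done, err := s.forward("KVS.Apply", args, args, reply); done {
		return err
	}

	// Apply the request through Raft on the leader.
	resp, err := s.raftApply(structs.KVSRequestType, args)
	if err != nil {
		return err
	}

	// The FSM may return either an error or the operation's boolean result.
	if respErr, ok := resp.(error); ok {
		return respErr
	}
	if respBool, ok := resp.(bool); ok {
		*reply = respBool
	}
	return nil
}
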
// globalRPC is used to forward an RPC request to one server in each datacenter.
// This will only error for RPC-related errors. Otherwise, application-level
// errors can be sent in the response objects.
func (s *Server) globalRPC(method string, args interface{},
	reply structs.CompoundResponse) error {

	// Make a new request into each datacenter
	dcs := s.router.GetDatacenters()

	replies, total := 0, len(dcs)
	errorCh := make(chan error, total)
	respCh := make(chan interface{}, total)

	for _, dc := range dcs {
		go func(dc string) {
			rr := reply.New()
			if err := s.forwardDC(method, dc, args, &rr); err != nil {
				errorCh <- err
				return
			}
			respCh <- rr
		}(dc)
	}

	for replies < total {
		select {
		case err := <-errorCh:
			return err
		case rr := <-respCh:
			reply.Add(rr)
			replies++
		}
	}
	return nil
}

// raftApply is used to encode a message, run it through raft, and return
// the FSM response along with any errors
func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{}, error) {
	buf, err := structs.Encode(t, msg)
	if err != nil {
		return nil, fmt.Errorf("Failed to encode request: %v", err)
	}

	// Warn if the command is very large
	if n := len(buf); n > raftWarnSize {
		s.logger.Printf("[WARN] consul: Attempting to apply large raft entry (%d bytes)", n)
	}

	future := s.raft.Apply(buf, enqueueLimit)
	if err := future.Error(); err != nil {
		return nil, err
	}

	return future.Response(), nil
}

// queryFn is used to perform a query operation. If a re-query is needed, the
// passed-in watch set will be used to block for changes. The passed-in state
// store should be used (vs. calling fsm.State()) since the given state store
// will be correctly watched for changes if the state store is restored from
// a snapshot.
type queryFn func(memdb.WatchSet, *state.Store) error

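// Illustrative sketch, not part of the original file: how a read endpoint
// typically drives blockingQuery, defined below. The method name is
// hypothetical, but the shape mirrors upstream endpoints such as
// Catalog.ListNodes: forward first, then perform the read inside a queryFn
// closure so blockingQuery can manage the watch set, timeout, and re-query.
func (s *Server) exampleListNodes(args *structs.DCSpecificRequest, reply *structs.IndexedNodes) error {
	if done, err := s.forward("Catalog.ListNodes", args, args, reply); done {
		return err
	}

	return s.blockingQuery(
		&args.QueryOptions,
		&reply.QueryMeta,
		func(ws memdb.WatchSet, state *state.Store) error {
			// Read from the supplied state store and register watches on ws;
			// if the returned index has not moved past MinQueryIndex,
			// blockingQuery blocks on ws and calls this closure again.
			index, nodes, err := state.Nodes(ws)
			if err != nil {
				return err
			}
			reply.Index, reply.Nodes = index, nodes
			return nil
		})
}
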
// blockingQuery is used to process a potentially blocking query operation.
func (s *Server) blockingQuery(queryOpts *structs.QueryOptions, queryMeta *structs.QueryMeta,
	fn queryFn) error {
	var timeout *time.Timer

	// Fast path right to the non-blocking query.
	if queryOpts.MinQueryIndex == 0 {
		goto RUN_QUERY
	}

	// Restrict the max query time, and ensure there is always one.
	if queryOpts.MaxQueryTime > maxQueryTime {
		queryOpts.MaxQueryTime = maxQueryTime
	} else if queryOpts.MaxQueryTime <= 0 {
		queryOpts.MaxQueryTime = defaultQueryTime
	}

	// Apply a small amount of jitter to the request.
	queryOpts.MaxQueryTime += lib.RandomStagger(queryOpts.MaxQueryTime / jitterFraction)

	// Setup a query timeout.
	timeout = time.NewTimer(queryOpts.MaxQueryTime)
	defer timeout.Stop()

RUN_QUERY:
	// Update the query metadata.
	s.setQueryMeta(queryMeta)

	// If the read must be consistent we verify that we are still the leader.
	if queryOpts.RequireConsistent {
		if err := s.consistentRead(); err != nil {
			return err
		}
	}

	// Run the query.
	metrics.IncrCounter([]string{"rpc", "query"}, 1)

	// Operate on a consistent set of state. This makes sure that the
	// abandon channel goes with the state that the caller is using to
	// build watches.
	state := s.fsm.State()

	// We can skip all watch tracking if this isn't a blocking query.
	var ws memdb.WatchSet
	if queryOpts.MinQueryIndex > 0 {
		ws = memdb.NewWatchSet()

		// This channel will be closed if a snapshot is restored and the
		// whole state store is abandoned.
		ws.Add(state.AbandonCh())
	}

	// Block up to the timeout if we didn't see anything fresh.
	err := fn(ws, state)
	// Note we check queryOpts.MinQueryIndex is greater than zero to determine if
	// blocking was requested by client, NOT meta.Index since the state function
	// might return zero if something is not initialized and care wasn't taken to
	// handle that special case (in practice this happened a lot so fixing it
	// systematically here beats trying to remember to add zero checks in every
	// state method). We also need to ensure that unless there is an error, we
	// return an index > 0 otherwise the client will never block and burn CPU and
	// requests.
	if err == nil && queryMeta.Index < 1 {
		queryMeta.Index = 1
	}
	if err == nil && queryOpts.MinQueryIndex > 0 && queryMeta.Index <= queryOpts.MinQueryIndex {
		if expired := ws.Watch(timeout.C); !expired {
			// If a restore may have woken us up then bail out from
			// the query immediately. This is slightly race-ey since
			// this might have been interrupted for other reasons,
			// but it's OK to kick it back to the caller in either
			// case.
			select {
			case <-state.AbandonCh():
			default:
				goto RUN_QUERY
			}
		}
	}
	return err
}

type sharedQueryFn func(memdb.WatchSet, *state.Store) (uint64, func(uint64, interface{}) error, error)

func (s *Server) sharedBlockingQuery(req cache.Request, res interface{}, queryOpts *structs.QueryOptions, queryMeta *structs.QueryMeta, fn sharedQueryFn) error {
	var timeout *time.Timer

	if queryOpts.RequireConsistent {
		if err := s.consistentRead(); err != nil {
			return err
		}
	}

	// Restrict the max query time, and ensure there is always one.
	if queryOpts.MaxQueryTime > maxQueryTime {
		queryOpts.MaxQueryTime = maxQueryTime
	} else if queryOpts.MaxQueryTime <= 0 {
		queryOpts.MaxQueryTime = defaultQueryTime
	}

	// Apply a small amount of jitter to the request.
	queryOpts.MaxQueryTime += lib.RandomStagger(queryOpts.MaxQueryTime / jitterFraction)

	// Setup a query timeout.
	timeout = time.NewTimer(queryOpts.MaxQueryTime)
	defer timeout.Stop()

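	// Blocking queries are de-duplicated by cache key: the first request for
	// a given key registers a blockingQueryState in s.blockingQueries and
	// spawns runSharedBlockingQuery to do the actual watching. Subsequent
	// requests with the same key simply wait on that state's Done channel or
	// their own timeout, and the last waiter to time out closes Cancel so
	// the shared query can stop early.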
	cacheInfo := req.CacheInfo()

	// Check if a blocking query is already running for this request
	s.blockingQueriesLock.RLock()
	queryState, alreadyInserted := s.blockingQueries[cacheInfo.Key]
	s.blockingQueriesLock.RUnlock()

	// If not, run one
	if !alreadyInserted {
		ws := memdb.NewWatchSet()
		// run the func a first time to get the index
		firstRunIndex, apply, err := fn(ws, s.fsm.State())

		s.blockingQueriesLock.Lock()
		queryState, alreadyInserted = s.blockingQueries[cacheInfo.Key]
		if alreadyInserted {
			// Another query raced with us and already ran a blocking query
			s.blockingQueriesLock.Unlock()
		} else {
			// Add query to map
			queryState = newBlockingQueryState(firstRunIndex, apply, err)
			s.blockingQueries[cacheInfo.Key] = queryState
			s.blockingQueriesLock.Unlock()

			// Run the shared blocking query
			go s.runSharedBlockingQuery(firstRunIndex, cacheInfo.Key, ws, queryMeta, queryState, fn)
		}
	}

	stateIndex := atomic.LoadUint64(&queryState.Index)

	if stateIndex <= queryOpts.MinQueryIndex {
		// Increment the shared query watcher
		atomic.AddInt32(&queryState.Watchers, 1)

		// block on either timeout or shared query
		select {
		case <-timeout.C:
			if n := atomic.AddInt32(&queryState.Watchers, -1); n == 0 {
				s.logger.Println("[TRACE] consul: cancelling shared blocking query because there are no more watchers")

				// we were the last request to wait on the shared blocking query and we reached MaxQueryTime, cancel the blocking query
				close(queryState.Cancel)
			}
		case <-queryState.Done:
		}
	}

	if err := queryState.Err.Load(); err != nil {
		return err.(error)
	}

	return queryState.Apply.Load().(func(uint64, interface{}) error)(atomic.LoadUint64(&queryState.Index), res)
}

func (s *Server) runSharedBlockingQuery(index uint64, cacheKey string, ws memdb.WatchSet, queryMeta *structs.QueryMeta, queryState *blockingQueryState, fn sharedQueryFn) {
	s.logger.Println("[TRACE] consul: running shared blocking query")

	// Wait initial query watchset
	expired := ws.Watch(queryState.Cancel)

RUN_QUERY:

	// If the read must be consistent we verify that we are still the leader.

	// Run the query.
	metrics.IncrCounter([]string{"rpc", "query"}, 1)

	// Operate on a consistent set of state. This makes sure that the
	// abandon channel goes with the state that the caller is using to
	// build watches.
	state := s.fsm.State()

	// We can skip all watch tracking if this isn't a blocking query.
	if index > 0 {
		ws = memdb.NewWatchSet()

		// This channel will be closed if a snapshot is restored and the
		// whole state store is abandoned.
		ws.Add(state.AbandonCh())
	}

	// Block up to the timeout if we didn't see anything fresh.
	idx, apply, err := fn(ws, state)
	// Note we check queryOpts.MinQueryIndex is greater than zero to determine if
	// blocking was requested by client, NOT meta.Index since the state function
	// might return zero if something is not initialized and care wasn't taken to
	// handle that special case (in practice this happened a lot so fixing it
	// systematically here beats trying to remember to add zero checks in every
	// state method). We also need to ensure that unless there is an error, we
	// return an index > 0 otherwise the client will never block and burn CPU and
	// requests.
	if err == nil && idx < 1 {
		idx = 1
	}
	if !expired && err == nil && index > 0 && idx <= index {
		if expired := ws.Watch(queryState.Cancel); !expired {
			// If a restore may have woken us up then bail out from
			// the query immediately. This is slightly race-ey since
			// this might have been interrupted for other reasons,
			// but it's OK to kick it back to the caller in either
			// case.
			select {
			case <-state.AbandonCh():
			default:
				goto RUN_QUERY
			}
		}
	}

	// store results
	s.blockingQueriesLock.Lock()
	if err != nil {
		queryState.Err.Store(err)
	}
	if apply != nil {
		queryState.Apply.Store(apply)
	}
	atomic.StoreUint64(&queryState.Index, idx)
	delete(s.blockingQueries, cacheKey)
	s.blockingQueriesLock.Unlock()

	// notify changed
	close(queryState.Done)
}

// setQueryMeta is used to populate the QueryMeta data for an RPC call
func (s *Server) setQueryMeta(m *structs.QueryMeta) {
	if s.IsLeader() {
		m.LastContact = 0
		m.KnownLeader = true
	} else {
		m.LastContact = time.Since(s.raft.LastContact())
		m.KnownLeader = (s.raft.Leader() != "")
	}
}

// consistentRead is used to ensure we do not perform a stale
// read. This is done by verifying leadership before the read.
func (s *Server) consistentRead() error {
	defer metrics.MeasureSince([]string{"rpc", "consistentRead"}, time.Now())
	future := s.raft.VerifyLeader()
	if err := future.Error(); err != nil {
		return err // fail fast if leader verification fails
	}
	// poll consistent read readiness, wait for up to RPCHoldTimeout
	if s.isReadyForConsistentReads() {
		return nil
	}
	jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
	deadline := time.Now().Add(s.config.RPCHoldTimeout)

	for time.Now().Before(deadline) {

		select {
		case <-time.After(jitter):
			// Drop through and check before we loop again.

		case <-s.shutdownCh:
			return fmt.Errorf("shutdown waiting for leader")
		}

		if s.isReadyForConsistentReads() {
			return nil
		}
	}

	return structs.ErrNotReadyForConsistentReads
}
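
// Illustrative sketch, not part of this file: blockingQueryState and
// newBlockingQueryState are defined elsewhere in this fork. The shape below is
// reconstructed purely from how they are used above, so the field names match
// the usage but the exact types and constructor body are assumptions.
//
//	type blockingQueryState struct {
//		Index    uint64        // last index observed, read/written atomically
//		Watchers int32         // number of requests waiting on this shared query
//		Err      atomic.Value  // error returned by the shared query, if any
//		Apply    atomic.Value  // func(uint64, interface{}) error used to fill a caller's response
//		Cancel   chan struct{} // closed by the last watcher to time out
//		Done     chan struct{} // closed once the shared query has stored its results
//	}
//
//	func newBlockingQueryState(index uint64, apply func(uint64, interface{}) error, err error) *blockingQueryState {
//		q := &blockingQueryState{Index: index, Cancel: make(chan struct{}), Done: make(chan struct{})}
//		if apply != nil {
//			q.Apply.Store(apply)
//		}
//		if err != nil {
//			q.Err.Store(err)
//		}
//		return q
//	}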