github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/flowinfra/flow_registry.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package flowinfra 12 13 import ( 14 "context" 15 "fmt" 16 "sync" 17 "time" 18 19 "github.com/cockroachdb/cockroach/pkg/base" 20 "github.com/cockroachdb/cockroach/pkg/settings" 21 "github.com/cockroachdb/cockroach/pkg/sql/execinfra" 22 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 23 "github.com/cockroachdb/cockroach/pkg/util/log" 24 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 25 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 26 "github.com/cockroachdb/errors" 27 "github.com/opentracing/opentracing-go" 28 ) 29 30 var errNoInboundStreamConnection = errors.New("no inbound stream connection") 31 32 // SettingFlowStreamTimeout is a cluster setting that sets the default flow 33 // stream timeout. 34 var SettingFlowStreamTimeout = settings.RegisterNonNegativeDurationSetting( 35 "sql.distsql.flow_stream_timeout", 36 "amount of time incoming streams wait for a flow to be set up before erroring out", 37 10*time.Second, 38 ) 39 40 // expectedConnectionTime is the expected time taken by a flow to connect to its 41 // consumers. 42 const expectedConnectionTime time.Duration = 500 * time.Millisecond 43 44 // InboundStreamInfo represents the endpoint where a data stream from another 45 // node connects to a flow. The external node initiates this process through a 46 // FlowStream RPC, which uses (*Flow).connectInboundStream() to associate the 47 // stream to a receiver to push rows to. 
48 type InboundStreamInfo struct { 49 // receiver is the entity that will receive rows from another host, which is 50 // part of a processor (normally an input synchronizer) for row-based 51 // execution and a colrpc.Inbox for vectorized execution. 52 // 53 // During a FlowStream RPC, the stream is handed off to this strategy to 54 // process. 55 receiver InboundStreamHandler 56 connected bool 57 // if set, indicates that we waited too long for an inbound connection, or 58 // we don't want this stream to connect anymore due to flow cancellation. 59 canceled bool 60 // finished is set if we have signaled that the stream is done transferring 61 // rows (to the flow's wait group). 62 finished bool 63 64 // waitGroup to signal on when finished. 65 waitGroup *sync.WaitGroup 66 } 67 68 // NewInboundStreamInfo returns a new InboundStreamInfo. 69 func NewInboundStreamInfo( 70 receiver InboundStreamHandler, waitGroup *sync.WaitGroup, 71 ) *InboundStreamInfo { 72 return &InboundStreamInfo{ 73 receiver: receiver, 74 waitGroup: waitGroup, 75 } 76 } 77 78 // flowEntry is a structure associated with a (potential) flow. 79 type flowEntry struct { 80 // waitCh is set if one or more clients are waiting for the flow; the 81 // channel gets closed when the flow is registered. 82 waitCh chan struct{} 83 84 // refCount is used to allow multiple clients to wait for a flow - if the 85 // flow never shows up, the refCount is used to decide which client cleans 86 // up the entry. 87 refCount int 88 89 flow *FlowBase 90 91 // inboundStreams are streams that receive data from other hosts, through the 92 // FlowStream API. All fields in the inboundStreamInfos are protected by the 93 // FlowRegistry mutex (except the receiver, whose methods can be called 94 // freely). 95 inboundStreams map[execinfrapb.StreamID]*InboundStreamInfo 96 97 // streamTimer is a timer that fires after a timeout and verifies that all 98 // inbound streams have been connected. 
99 streamTimer *time.Timer 100 } 101 102 // FlowRegistry allows clients to look up flows by ID and to wait for flows to 103 // be registered. Multiple clients can wait concurrently for the same flow. 104 type FlowRegistry struct { 105 syncutil.Mutex 106 107 // All fields in the flowEntry's are protected by the FlowRegistry mutex, 108 // except flow, whose methods can be called freely. 109 flows map[execinfrapb.FlowID]*flowEntry 110 111 // draining specifies whether the FlowRegistry is in drain mode. If it is, 112 // the FlowRegistry will not accept new flows. 113 draining bool 114 115 // flowDone is signaled whenever the size of flows decreases. 116 flowDone *sync.Cond 117 118 // testingRunBeforeDrainSleep is a testing knob executed when a draining 119 // FlowRegistry has no registered flows but must still wait for a minimum time 120 // for any incoming flows to register. 121 testingRunBeforeDrainSleep func() 122 } 123 124 // NewFlowRegistry creates a new FlowRegistry. 125 // 126 // instID is the ID of the current node. Used for debugging; pass 0 if you don't 127 // care. 128 func NewFlowRegistry(instID base.SQLInstanceID) *FlowRegistry { 129 fr := &FlowRegistry{flows: make(map[execinfrapb.FlowID]*flowEntry)} 130 fr.flowDone = sync.NewCond(fr) 131 return fr 132 } 133 134 // getEntryLocked returns the flowEntry associated with the id. If the entry 135 // doesn't exist, one is created and inserted into the map. 136 // It should only be called while holding the mutex. 137 func (fr *FlowRegistry) getEntryLocked(id execinfrapb.FlowID) *flowEntry { 138 entry, ok := fr.flows[id] 139 if !ok { 140 entry = &flowEntry{} 141 fr.flows[id] = entry 142 } 143 return entry 144 } 145 146 // releaseEntryLocked decreases the refCount in the entry for the given id, and 147 // cleans up the entry if the refCount reaches 0. 148 // It should only be called while holding the mutex. 
149 func (fr *FlowRegistry) releaseEntryLocked(id execinfrapb.FlowID) { 150 entry := fr.flows[id] 151 if entry.refCount > 1 { 152 entry.refCount-- 153 } else { 154 if entry.refCount != 1 { 155 panic(fmt.Sprintf("invalid refCount: %d", entry.refCount)) 156 } 157 delete(fr.flows, id) 158 fr.flowDone.Signal() 159 } 160 } 161 162 type flowRetryableError struct { 163 cause error 164 } 165 166 func (e *flowRetryableError) Error() string { 167 return fmt.Sprintf("flow retryable error: %+v", e.cause) 168 } 169 170 // IsFlowRetryableError returns true if an error represents a retryable 171 // flow error. 172 func IsFlowRetryableError(e error) bool { 173 return errors.HasType(e, (*flowRetryableError)(nil)) 174 } 175 176 // RegisterFlow makes a flow accessible to ConnectInboundStream. Any concurrent 177 // ConnectInboundStream calls that are waiting for this flow are woken up. 178 // 179 // It is expected that UnregisterFlow will be called at some point to remove the 180 // flow from the registry. 181 // 182 // inboundStreams are all the remote streams that will be connected into this 183 // flow. If any of them is not connected within timeout, errors are propagated. 184 // The inboundStreams are expected to have been initialized with their 185 // WaitGroups (the group should have been incremented). RegisterFlow takes 186 // responsibility for calling Done() on that WaitGroup; this responsibility will 187 // be forwarded forward by ConnectInboundStream. In case this method returns an 188 // error, the WaitGroup will be decremented. 
189 func (fr *FlowRegistry) RegisterFlow( 190 ctx context.Context, 191 id execinfrapb.FlowID, 192 f *FlowBase, 193 inboundStreams map[execinfrapb.StreamID]*InboundStreamInfo, 194 timeout time.Duration, 195 ) (retErr error) { 196 fr.Lock() 197 defer fr.Unlock() 198 defer func() { 199 if retErr != nil { 200 for _, stream := range inboundStreams { 201 stream.waitGroup.Done() 202 } 203 } 204 }() 205 206 draining := fr.draining 207 if f.Cfg != nil { 208 if knobs, ok := f.Cfg.TestingKnobs.Flowinfra.(*TestingKnobs); ok && knobs != nil && knobs.FlowRegistryDraining != nil { 209 draining = knobs.FlowRegistryDraining() 210 } 211 } 212 213 if draining { 214 return &flowRetryableError{cause: errors.Errorf( 215 "could not register flowID %d because the registry is draining", 216 id, 217 )} 218 } 219 entry := fr.getEntryLocked(id) 220 if entry.flow != nil { 221 return errors.Errorf( 222 "flow already registered: flowID: %s.\n"+ 223 "Current flow: %+v\nExisting flow: %+v", 224 f.spec.FlowID, f.spec, entry.flow.spec) 225 } 226 // Take a reference that will be removed by UnregisterFlow. 227 entry.refCount++ 228 entry.flow = f 229 entry.inboundStreams = inboundStreams 230 // If there are any waiters, wake them up by closing waitCh. 231 if entry.waitCh != nil { 232 close(entry.waitCh) 233 } 234 235 if len(inboundStreams) > 0 { 236 // Set up a function to time out inbound streams after a while. 237 entry.streamTimer = time.AfterFunc(timeout, func() { 238 fr.Lock() 239 // We're giving up waiting for these inbound streams. We will push an 240 // error to its consumer after fr.Unlock; the error will propagate and 241 // eventually drain all the processors. 242 timedOutReceivers := fr.cancelPendingStreamsLocked(id) 243 fr.Unlock() 244 if len(timedOutReceivers) != 0 { 245 // The span in the context might be finished by the time this runs. In 246 // principle, we could ForkCtxSpan() beforehand, but we don't want to 247 // create the extra span every time. 
248 timeoutCtx := opentracing.ContextWithSpan(ctx, nil) 249 log.Errorf( 250 timeoutCtx, 251 "flow id:%s : %d inbound streams timed out after %s; propagated error throughout flow", 252 id, 253 len(timedOutReceivers), 254 timeout, 255 ) 256 } 257 for _, r := range timedOutReceivers { 258 go func(r InboundStreamHandler) { 259 r.Timeout(errNoInboundStreamConnection) 260 }(r) 261 } 262 }) 263 } 264 return nil 265 } 266 267 // cancelPendingStreamsLocked cancels all of the streams that haven't been 268 // connected yet in this flow, by setting them to finished and ending their 269 // wait group. The method returns the list of RowReceivers corresponding to the 270 // streams that were canceled. The caller is expected to send those 271 // RowReceivers a cancellation message - this method can't do it because sending 272 // those messages shouldn't happen under the flow registry's lock. 273 func (fr *FlowRegistry) cancelPendingStreamsLocked(id execinfrapb.FlowID) []InboundStreamHandler { 274 entry := fr.flows[id] 275 if entry == nil || entry.flow == nil { 276 return nil 277 } 278 pendingReceivers := make([]InboundStreamHandler, 0) 279 for streamID, is := range entry.inboundStreams { 280 // Connected, non-finished inbound streams will get an error 281 // returned in ProcessInboundStream(). Non-connected streams 282 // are handled below. 283 if !is.connected && !is.finished && !is.canceled { 284 is.canceled = true 285 pendingReceivers = append(pendingReceivers, is.receiver) 286 fr.finishInboundStreamLocked(id, streamID) 287 } 288 } 289 return pendingReceivers 290 } 291 292 // UnregisterFlow removes a flow from the registry. Any subsequent 293 // ConnectInboundStream calls for the flow will fail to find it and time out. 
294 func (fr *FlowRegistry) UnregisterFlow(id execinfrapb.FlowID) { 295 fr.Lock() 296 entry := fr.flows[id] 297 if entry.streamTimer != nil { 298 entry.streamTimer.Stop() 299 entry.streamTimer = nil 300 } 301 fr.releaseEntryLocked(id) 302 fr.Unlock() 303 } 304 305 // waitForFlowLocked waits until the flow with the given id gets registered - 306 // up to the given timeout - and returns the flowEntry. If the timeout elapses, 307 // returns nil. It should only be called while holding the mutex. The mutex is 308 // temporarily unlocked if we need to wait. 309 // It is illegal to call this if the flow is already connected. 310 func (fr *FlowRegistry) waitForFlowLocked( 311 ctx context.Context, id execinfrapb.FlowID, timeout time.Duration, 312 ) *flowEntry { 313 entry := fr.getEntryLocked(id) 314 if entry.flow != nil { 315 log.Fatalf(ctx, "waitForFlowLocked called for a flow that's already registered: %d", id) 316 } 317 318 // Flow not registered (at least not yet). 319 320 // Set up a channel that gets closed when the flow shows up, or when the 321 // timeout elapses. The channel might have been created already if there are 322 // other waiters for the same id. 323 waitCh := entry.waitCh 324 if waitCh == nil { 325 waitCh = make(chan struct{}) 326 entry.waitCh = waitCh 327 } 328 entry.refCount++ 329 fr.Unlock() 330 331 select { 332 case <-waitCh: 333 case <-time.After(timeout): 334 case <-ctx.Done(): 335 } 336 337 fr.Lock() 338 339 fr.releaseEntryLocked(id) 340 if entry.flow == nil { 341 return nil 342 } 343 344 return entry 345 } 346 347 // Drain waits at most flowDrainWait for currently running flows to finish and 348 // at least minFlowDrainWait for any incoming flows to be registered. If there 349 // are still flows active after flowDrainWait, Drain waits an extra 350 // expectedConnectionTime so that any flows that were registered at the end of 351 // the time window have a reasonable amount of time to connect to their 352 // consumers, thus unblocking them. 
// The FlowRegistry rejects any new flows once it has finished draining.
//
// Note that since local flows are not added to the registry, they are not
// waited for. However, this is fine since there should be no local flows
// running when the FlowRegistry drains as the draining logic starts with
// draining all client connections to a node.
//
// The reporter callback, if non-nil, is called on a best effort basis
// to report work that needed to be done and which may or may not have
// been done by the time this call returns. See the explanation in
// pkg/server/drain.go for details. It is invoked at most once, with the
// number of flows registered at that moment, and only if there are any.
func (fr *FlowRegistry) Drain(
	flowDrainWait time.Duration, minFlowDrainWait time.Duration, reporter func(int, string),
) {
	// allFlowsDone is buffered so that the final send at the bottom of this
	// function never blocks, even if the watchdog goroutine below has already
	// exited through its timeout branch.
	allFlowsDone := make(chan struct{}, 1)
	start := timeutil.Now()
	// stopWaiting is only read and written while holding the registry mutex.
	stopWaiting := false

	// sleep runs the testing knob (if any) and then sleeps for t.
	sleep := func(t time.Duration) {
		if fr.testingRunBeforeDrainSleep != nil {
			fr.testingRunBeforeDrainSleep()
		}
		time.Sleep(t)
	}

	defer func() {
		// At this stage, we have either hit the flowDrainWait timeout or we have no
		// flows left. We wait for an expectedConnectionTime longer so that we give
		// any flows that were registered in the
		// flowDrainWait - expectedConnectionTime window enough time to establish
		// connections to their consumers so that the consumers do not block for a
		// long time waiting for a connection to be established.
		fr.Lock()
		// From here on RegisterFlow refuses new flows.
		fr.draining = true
		if len(fr.flows) > 0 {
			// Don't hold the lock while sleeping.
			fr.Unlock()
			time.Sleep(expectedConnectionTime)
			fr.Lock()
		}
		fr.Unlock()
	}()

	fr.Lock()
	if len(fr.flows) == 0 {
		// Nothing is registered right now; still give incoming flows the
		// minimum wait to show up, without holding the lock.
		fr.Unlock()
		sleep(minFlowDrainWait)
		fr.Lock()
		// No flows were registered, return.
		if len(fr.flows) == 0 {
			fr.Unlock()
			return
		}
	}
	if reporter != nil {
		// Report progress to the Drain RPC.
		reporter(len(fr.flows), "distSQL execution flows")
	}

	// Watchdog: once flowDrainWait elapses, tell the wait loops below to give
	// up. It exits early if Drain completes first.
	go func() {
		select {
		case <-time.After(flowDrainWait):
			fr.Lock()
			stopWaiting = true
			// Wake up the wait loop so that it observes stopWaiting.
			fr.flowDone.Signal()
			fr.Unlock()
		case <-allFlowsDone:
		}
	}()

	// The mutex is held here; Wait releases it while blocked and reacquires it
	// before returning.
	for !(stopWaiting || len(fr.flows) == 0) {
		fr.flowDone.Wait()
	}
	fr.Unlock()

	// If we spent less time waiting for all registered flows to finish, wait
	// for the minimum time for any new incoming flows and wait for these to
	// finish.
	waitTime := timeutil.Since(start)
	if waitTime < minFlowDrainWait {
		sleep(minFlowDrainWait - waitTime)
		fr.Lock()
		for !(stopWaiting || len(fr.flows) == 0) {
			fr.flowDone.Wait()
		}
		fr.Unlock()
	}

	// Let the watchdog goroutine exit if it is still waiting.
	allFlowsDone <- struct{}{}
}

// Undrain causes the FlowRegistry to start accepting flows again.
func (fr *FlowRegistry) Undrain() {
	fr.Lock()
	fr.draining = false
	fr.Unlock()
}

// ConnectInboundStream finds the InboundStreamInfo for the given
// <flowID,streamID> pair and marks it as connected. It waits up to timeout for
// the flow to be registered with the registry. It also sends the handshake
// messages to the producer of the stream.
//
// stream is the inbound stream.
//
// It returns the Flow that the stream is connecting to, the receiver that the
// stream must push data to and a cleanup function that must be called after
// all the data has been pushed; the cleanup function marks the inbound stream
// as finished in the registry.
//
// The cleanup function will decrement the flow's WaitGroup, so that Flow.Wait()
// is not blocked on this stream any more.
// In case an error is returned, the cleanup function is nil, the stream is not
// considered connected and is not cleaned up.
465 func (fr *FlowRegistry) ConnectInboundStream( 466 ctx context.Context, 467 flowID execinfrapb.FlowID, 468 streamID execinfrapb.StreamID, 469 stream execinfrapb.DistSQL_FlowStreamServer, 470 timeout time.Duration, 471 ) (_ *FlowBase, _ InboundStreamHandler, _ func(), retErr error) { 472 fr.Lock() 473 defer fr.Unlock() 474 475 entry := fr.getEntryLocked(flowID) 476 if entry.flow == nil { 477 // Send the handshake message informing the producer that the consumer has 478 // not been scheduled yet. Another handshake will be sent below once the 479 // consumer has been connected. 480 deadline := timeutil.Now().Add(timeout) 481 if err := stream.Send(&execinfrapb.ConsumerSignal{ 482 Handshake: &execinfrapb.ConsumerHandshake{ 483 ConsumerScheduled: false, 484 ConsumerScheduleDeadline: &deadline, 485 Version: execinfra.Version, 486 MinAcceptedVersion: execinfra.MinAcceptedVersion, 487 }, 488 }); err != nil { 489 // TODO(andrei): We failed to send a message to the producer; we'll return 490 // an error and leave this stream with connected == false so it times out 491 // later. We could call finishInboundStreamLocked() now so that the flow 492 // doesn't wait for the timeout and we could remember the error for the 493 // consumer if the consumer comes later, but I'm not sure what the best 494 // way to do that is. Similarly for the 2nd handshake message below, 495 // except there we already have the consumer and we can push the error. 
496 return nil, nil, nil, err 497 } 498 entry = fr.waitForFlowLocked(ctx, flowID, timeout) 499 if entry == nil { 500 return nil, nil, nil, errors.Errorf("flow %s not found", flowID) 501 } 502 } 503 504 s, ok := entry.inboundStreams[streamID] 505 if !ok { 506 return nil, nil, nil, errors.Errorf("flow %s: no inbound stream %d", flowID, streamID) 507 } 508 if s.connected { 509 return nil, nil, nil, errors.Errorf("flow %s: inbound stream %d already connected", flowID, streamID) 510 } 511 if s.canceled { 512 return nil, nil, nil, errors.Errorf("flow %s: inbound stream %d came too late", flowID, streamID) 513 } 514 515 // We now mark the stream as connected but, if an error happens later because 516 // the handshake fails, we reset the state; we want the stream to be 517 // considered timed out when the moment comes just as if this connection 518 // attempt never happened. 519 s.connected = true 520 defer func() { 521 if retErr != nil { 522 s.connected = false 523 } 524 }() 525 526 if err := stream.Send(&execinfrapb.ConsumerSignal{ 527 Handshake: &execinfrapb.ConsumerHandshake{ 528 ConsumerScheduled: true, 529 Version: execinfra.Version, 530 MinAcceptedVersion: execinfra.MinAcceptedVersion, 531 }, 532 }); err != nil { 533 return nil, nil, nil, err 534 } 535 536 cleanup := func() { 537 fr.Lock() 538 fr.finishInboundStreamLocked(flowID, streamID) 539 fr.Unlock() 540 } 541 return entry.flow, s.receiver, cleanup, nil 542 } 543 544 func (fr *FlowRegistry) finishInboundStreamLocked( 545 fid execinfrapb.FlowID, sid execinfrapb.StreamID, 546 ) { 547 flowEntry := fr.getEntryLocked(fid) 548 streamEntry := flowEntry.inboundStreams[sid] 549 550 if !streamEntry.connected && !streamEntry.canceled { 551 panic("finising inbound stream that didn't connect or time out") 552 } 553 if streamEntry.finished { 554 panic("double finish") 555 } 556 557 streamEntry.finished = true 558 streamEntry.waitGroup.Done() 559 }