github.com/ydb-platform/ydb-go-sdk/v3@v3.89.2/internal/topic/topicreaderinternal/stream_reconnector.go (about) 1 package topicreaderinternal 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "runtime" 8 "sync" 9 "time" 10 11 "github.com/jonboulle/clockwork" 12 13 "github.com/ydb-platform/ydb-go-sdk/v3/internal/background" 14 "github.com/ydb-platform/ydb-go-sdk/v3/internal/backoff" 15 "github.com/ydb-platform/ydb-go-sdk/v3/internal/empty" 16 "github.com/ydb-platform/ydb-go-sdk/v3/internal/topic" 17 "github.com/ydb-platform/ydb-go-sdk/v3/internal/topic/topicreadercommon" 18 "github.com/ydb-platform/ydb-go-sdk/v3/internal/tx" 19 "github.com/ydb-platform/ydb-go-sdk/v3/internal/value" 20 "github.com/ydb-platform/ydb-go-sdk/v3/internal/xcontext" 21 "github.com/ydb-platform/ydb-go-sdk/v3/internal/xerrors" 22 "github.com/ydb-platform/ydb-go-sdk/v3/internal/xsync" 23 "github.com/ydb-platform/ydb-go-sdk/v3/trace" 24 ) 25 26 var ( 27 errReconnectRequestOutdated = xerrors.Wrap(errors.New("ydb: reconnect request outdated")) 28 errReconnect = xerrors.Wrap(errors.New("ydb: reconnect to topic grpc stream")) 29 errConnectionTimeout = xerrors.Wrap(errors.New("ydb: topic reader connection timeout for stream")) 30 ) 31 32 type readerConnectFunc func(ctx context.Context) (batchedStreamReader, error) 33 34 type readerReconnector struct { 35 background background.Worker 36 clock clockwork.Clock 37 retrySettings topic.RetrySettings 38 streamVal batchedStreamReader 39 streamContextCancel context.CancelCauseFunc 40 streamErr error 41 initErr error 42 tracer *trace.Topic 43 readerConnect readerConnectFunc 44 reconnectFromBadStream chan reconnectRequest 45 connectTimeout time.Duration 46 readerID int64 47 streamConnectionInProgress empty.Chan // opened if connection in progress, closed if connection established 48 initDoneCh empty.Chan 49 m xsync.RWMutex 50 closeOnce sync.Once 51 initDone bool 52 } 53 54 func newReaderReconnector( 55 readerID int64, 56 connector readerConnectFunc, 57 connectTimeout time.Duration, 58 retrySettings topic.RetrySettings, 59 tracer *trace.Topic, 60 ) *readerReconnector { 61 res := &readerReconnector{ 62 readerID: readerID, 63 clock: clockwork.NewRealClock(), 64 readerConnect: connector, 65 streamErr: errUnconnected, 66 connectTimeout: connectTimeout, 67 tracer: tracer, 68 retrySettings: retrySettings, 69 } 70 71 if res.connectTimeout == 0 { 72 res.connectTimeout = value.InfiniteDuration 73 } 74 75 res.initChannelsAndClock() 76 res.start() 77 78 return res 79 } 80 81 func (r *readerReconnector) PopMessagesBatchTx( 82 ctx context.Context, 83 tx tx.Transaction, 84 opts ReadMessageBatchOptions, 85 ) ( 86 *topicreadercommon.PublicBatch, 87 error, 88 ) { 89 return r.readWithReconnections( 90 ctx, 91 func( 92 ctx context.Context, 93 stream batchedStreamReader, 94 ) ( 95 *topicreadercommon.PublicBatch, 96 error, 97 ) { 98 return stream.PopMessagesBatchTx(ctx, tx, opts) 99 }, 100 ) 101 } 102 103 func (r *readerReconnector) ReadMessageBatch( 104 ctx context.Context, 105 opts ReadMessageBatchOptions, 106 ) ( 107 *topicreadercommon.PublicBatch, 108 error, 109 ) { 110 return r.readWithReconnections( 111 ctx, 112 func( 113 ctx context.Context, 114 stream batchedStreamReader, 115 ) ( 116 *topicreadercommon.PublicBatch, 117 error, 118 ) { 119 return stream.ReadMessageBatch(ctx, opts) 120 }, 121 ) 122 } 123 124 func (r *readerReconnector) readWithReconnections( 125 ctx context.Context, 126 read func( 127 ctx context.Context, 128 stream batchedStreamReader, 129 ) (*topicreadercommon.PublicBatch, error), 130 ) ( 131 *topicreadercommon.PublicBatch, 132 error, 133 ) { 134 if ctx.Err() != nil { 135 return nil, ctx.Err() 136 } 137 138 attempt := 0 139 140 for { 141 if attempt > 0 { 142 if err := func() error { 143 t := r.clock.NewTimer(backoff.Fast.Delay(attempt)) 144 defer t.Stop() 145 146 select { 147 case <-ctx.Done(): 148 return ctx.Err() 149 case <-t.Chan(): 150 return nil 151 } 152 }(); err != nil { 153 return nil, err 154 } 155 } 156 157 attempt++ 158 stream, err := r.stream(ctx) 159 switch { 160 case r.isRetriableError(err): 161 r.fireReconnectOnRetryableError(stream, err) 162 runtime.Gosched() 163 164 continue 165 case err != nil: 166 return nil, err 167 default: 168 // pass 169 } 170 171 res, err := read(ctx, stream) 172 if r.isRetriableError(err) { 173 r.fireReconnectOnRetryableError(stream, err) 174 runtime.Gosched() 175 176 continue 177 } 178 179 return res, err 180 } 181 } 182 183 func (r *readerReconnector) Commit( 184 ctx context.Context, 185 commitRange topicreadercommon.CommitRange, 186 ) error { 187 stream, err := r.stream(ctx) 188 if err != nil { 189 return err 190 } 191 192 err = stream.Commit(ctx, commitRange) 193 r.fireReconnectOnRetryableError(stream, err) 194 195 return err 196 } 197 198 func (r *readerReconnector) CloseWithError(ctx context.Context, reason error) error { 199 var closeErr error 200 r.closeOnce.Do(func() { 201 closeErr = r.background.Close(ctx, reason) 202 203 if r.streamVal != nil { 204 streamCloseErr := r.streamVal.CloseWithError(ctx, xerrors.WithStackTrace(errReaderClosed)) 205 r.streamContextCancel(errReaderClosed) 206 if closeErr == nil { 207 closeErr = streamCloseErr 208 } 209 } 210 211 r.m.WithLock(func() { 212 if !r.initDone { 213 r.initErr = reason 214 close(r.initDoneCh) 215 } 216 }) 217 }) 218 219 return closeErr 220 } 221 222 func (r *readerReconnector) start() { 223 r.background.Start("reconnector-loop", r.reconnectionLoop) 224 225 // start first connection 226 r.reconnectFromBadStream <- newReconnectRequest(nil, nil) 227 } 228 229 func (r *readerReconnector) initChannelsAndClock() { 230 if r.clock == nil { 231 r.clock = clockwork.NewRealClock() 232 } 233 r.reconnectFromBadStream = make(chan reconnectRequest, 1) 234 r.streamConnectionInProgress = make(empty.Chan) 235 r.initDoneCh = make(empty.Chan) 236 close(r.streamConnectionInProgress) // no progress at start 237 } 238 239 func (r *readerReconnector) reconnectionLoop(ctx context.Context) { 240 defer r.handlePanic() 241 242 var retriesStarted time.Time 243 lastTime := time.Time{} 244 attempt := 0 245 for { 246 now := r.clock.Now() 247 if topic.CheckResetReconnectionCounters(lastTime, now, r.connectTimeout) { 248 attempt = 0 249 retriesStarted = time.Now() 250 } else { 251 attempt++ 252 } 253 lastTime = now 254 255 var request reconnectRequest 256 select { 257 case <-ctx.Done(): 258 return 259 260 case request = <-r.reconnectFromBadStream: 261 if retriesStarted.IsZero() { 262 retriesStarted = time.Now() 263 } 264 } 265 266 onReconnectionDone := trace.TopicOnReaderReconnect(r.tracer, request.reason) 267 268 if request.reason != nil { 269 retryBackoff, stopRetryReason := r.checkErrRetryMode( 270 request.reason, 271 r.clock.Since(retriesStarted), 272 ) 273 if stopRetryReason == nil { 274 if err := func() error { 275 t := r.clock.NewTimer(retryBackoff.Delay(attempt)) 276 defer t.Stop() 277 278 select { 279 case <-ctx.Done(): 280 return ctx.Err() 281 case <-t.Chan(): 282 return nil 283 } 284 }(); err != nil { 285 return 286 } 287 } else { 288 _ = r.CloseWithError(ctx, stopRetryReason) 289 onReconnectionDone(stopRetryReason) 290 291 return 292 } 293 } 294 295 err := r.reconnect(ctx, request.reason, request.oldReader) 296 onReconnectionDone(err) 297 } 298 } 299 300 //nolint:funlen 301 func (r *readerReconnector) reconnect(ctx context.Context, reason error, oldReader batchedStreamReader) (err error) { 302 onDone := trace.TopicOnReaderReconnect(r.tracer, reason) 303 defer func() { onDone(err) }() 304 305 if err = ctx.Err(); err != nil { 306 return err 307 } 308 309 var closedErr error 310 r.m.WithRLock(func() { 311 closedErr = r.background.CloseReason() 312 }) 313 if closedErr != nil { 314 return err 315 } 316 317 if stream, _ := r.stream(ctx); oldReader != stream { 318 return xerrors.WithStackTrace(errReconnectRequestOutdated) 319 } 320 321 connectionInProgress := make(empty.Chan) 322 defer close(connectionInProgress) 323 324 r.m.WithLock(func() { 325 r.streamConnectionInProgress = connectionInProgress 326 }) 327 328 if oldReader != nil { 329 _ = oldReader.CloseWithError(ctx, xerrors.WithStackTrace(errReconnect)) 330 } 331 332 newStream, newStreamClose, err := r.connectWithTimeout() 333 334 switch { 335 case err == nil: 336 // pass 337 case r.isRetriableError(err): 338 sendReason := err 339 r.background.Start("ydb topic reader send reconnect message", func(ctx context.Context) { 340 select { 341 case r.reconnectFromBadStream <- newReconnectRequest(oldReader, sendReason): 342 trace.TopicOnReaderReconnectRequest(r.tracer, err, true) 343 case <-ctx.Done(): 344 trace.TopicOnReaderReconnectRequest(r.tracer, ctx.Err(), false) 345 } 346 }) 347 default: 348 // unretriable error 349 _ = r.CloseWithError(ctx, err) 350 } 351 352 r.m.WithLock(func() { 353 r.streamErr = err 354 if err == nil { 355 r.streamVal = newStream 356 r.streamContextCancel = newStreamClose 357 if !r.initDone { 358 r.initDone = true 359 close(r.initDoneCh) 360 } 361 } 362 }) 363 364 return err 365 } 366 367 func (r *readerReconnector) isRetriableError(err error) bool { 368 _, stopReason := topic.RetryDecision(err, r.retrySettings, 0) 369 370 return stopReason == nil 371 } 372 373 func (r *readerReconnector) checkErrRetryMode(err error, retriesDuration time.Duration) ( 374 backoffType backoff.Backoff, 375 stopRetryReason error, 376 ) { 377 return topic.RetryDecision(err, r.retrySettings, retriesDuration) 378 } 379 380 func (r *readerReconnector) connectWithTimeout() (_ batchedStreamReader, _ context.CancelCauseFunc, err error) { 381 bgContext := r.background.Context() 382 383 if err = bgContext.Err(); err != nil { 384 return nil, nil, err 385 } 386 387 connectionContext, cancel := context.WithCancelCause(xcontext.ValueOnly(bgContext)) 388 389 type connectResult struct { 390 stream batchedStreamReader 391 err error 392 } 393 result := make(chan connectResult, 1) 394 395 go func() { 396 stream, err := r.readerConnect(connectionContext) 397 result <- connectResult{stream: stream, err: err} 398 }() 399 400 connectionTimoutTimer := r.clock.NewTimer(r.connectTimeout) 401 defer connectionTimoutTimer.Stop() 402 403 var res connectResult 404 select { 405 case <-connectionTimoutTimer.Chan(): 406 // cancel connection context only if timeout exceed while connection 407 // because if cancel context after connect - it will break 408 cancel(xerrors.WithStackTrace(errConnectionTimeout)) 409 res = <-result 410 case res = <-result: 411 // pass 412 } 413 414 if res.err == nil { 415 return res.stream, cancel, nil 416 } 417 418 return nil, nil, res.err 419 } 420 421 func (r *readerReconnector) WaitInit(ctx context.Context) error { 422 if ctx.Err() != nil { 423 return ctx.Err() 424 } 425 426 select { 427 case <-ctx.Done(): 428 return ctx.Err() 429 case <-r.initDoneCh: 430 return r.initErr 431 case <-r.background.Done(): 432 return r.background.CloseReason() 433 } 434 } 435 436 func (r *readerReconnector) fireReconnectOnRetryableError(stream batchedStreamReader, err error) { 437 if !r.isRetriableError(err) { 438 return 439 } 440 441 select { 442 case r.reconnectFromBadStream <- newReconnectRequest(stream, err): 443 // send signal 444 trace.TopicOnReaderReconnectRequest(r.tracer, err, true) 445 default: 446 // previous reconnect signal in process, no need sent signal more 447 trace.TopicOnReaderReconnectRequest(r.tracer, err, false) 448 } 449 } 450 451 func (r *readerReconnector) stream(ctx context.Context) (batchedStreamReader, error) { 452 if err := ctx.Err(); err != nil { 453 return nil, err 454 } 455 456 var err error 457 var connectionChan empty.Chan 458 r.m.WithRLock(func() { 459 connectionChan = r.streamConnectionInProgress 460 err = r.background.CloseReason() 461 }) 462 if err != nil { 463 return nil, err 464 } 465 466 select { 467 case <-ctx.Done(): 468 return nil, ctx.Err() 469 case <-r.background.Done(): 470 return nil, r.background.CloseReason() 471 case <-connectionChan: 472 var reader batchedStreamReader 473 r.m.WithRLock(func() { 474 reader = r.streamVal 475 err = r.streamErr 476 }) 477 r.fireReconnectOnRetryableError(reader, err) 478 479 return reader, err 480 } 481 } 482 483 func (r *readerReconnector) handlePanic() { 484 if p := recover(); p != nil { 485 _ = r.CloseWithError(context.Background(), xerrors.WithStackTrace(fmt.Errorf("handled panic: %v", p))) 486 } 487 } 488 489 type reconnectRequest struct { 490 oldReader batchedStreamReader 491 reason error 492 } 493 494 func newReconnectRequest(oldReader batchedStreamReader, reason error) reconnectRequest { 495 return reconnectRequest{ 496 oldReader: oldReader, 497 reason: reason, 498 } 499 }