github.com/ydb-platform/ydb-go-sdk/v3@v3.57.0/internal/topic/topicreaderinternal/stream_reconnector.go (about) 1 package topicreaderinternal 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "runtime" 8 "sync" 9 "time" 10 11 "github.com/jonboulle/clockwork" 12 13 "github.com/ydb-platform/ydb-go-sdk/v3/internal/background" 14 "github.com/ydb-platform/ydb-go-sdk/v3/internal/backoff" 15 "github.com/ydb-platform/ydb-go-sdk/v3/internal/empty" 16 "github.com/ydb-platform/ydb-go-sdk/v3/internal/topic" 17 "github.com/ydb-platform/ydb-go-sdk/v3/internal/value" 18 "github.com/ydb-platform/ydb-go-sdk/v3/internal/xcontext" 19 "github.com/ydb-platform/ydb-go-sdk/v3/internal/xerrors" 20 "github.com/ydb-platform/ydb-go-sdk/v3/internal/xsync" 21 "github.com/ydb-platform/ydb-go-sdk/v3/trace" 22 ) 23 24 var ( 25 errReconnectRequestOutdated = xerrors.Wrap(errors.New("ydb: reconnect request outdated")) 26 errReconnect = xerrors.Wrap(errors.New("ydb: reconnect to topic grpc stream")) 27 ) 28 29 type readerConnectFunc func(ctx context.Context) (batchedStreamReader, error) 30 31 type readerReconnector struct { 32 background background.Worker 33 clock clockwork.Clock 34 baseContext context.Context 35 retrySettings topic.RetrySettings 36 streamVal batchedStreamReader 37 streamErr error 38 closedErr error 39 initErr error 40 tracer *trace.Topic 41 readerConnect readerConnectFunc 42 reconnectFromBadStream chan reconnectRequest 43 connectTimeout time.Duration 44 readerID int64 45 streamConnectionInProgress empty.Chan // opened if connection in progress, closed if connection established 46 initDoneCh empty.Chan 47 m xsync.RWMutex 48 closeOnce sync.Once 49 initDone bool 50 } 51 52 //nolint:revive 53 func newReaderReconnector( 54 readerID int64, 55 connector readerConnectFunc, 56 connectTimeout time.Duration, 57 retrySettings topic.RetrySettings, 58 tracer *trace.Topic, 59 baseContext context.Context, 60 ) *readerReconnector { 61 res := &readerReconnector{ 62 readerID: readerID, 63 clock: clockwork.NewRealClock(), 64 readerConnect: connector, 65 streamErr: errUnconnected, 66 connectTimeout: connectTimeout, 67 tracer: tracer, 68 baseContext: baseContext, 69 retrySettings: retrySettings, 70 } 71 72 if res.connectTimeout == 0 { 73 res.connectTimeout = value.InfiniteDuration 74 } 75 76 res.initChannelsAndClock() 77 res.start() 78 79 return res 80 } 81 82 func (r *readerReconnector) ReadMessageBatch(ctx context.Context, opts ReadMessageBatchOptions) (*PublicBatch, error) { 83 if ctx.Err() != nil { 84 return nil, ctx.Err() 85 } 86 87 attempt := 0 88 89 for { 90 if attempt > 0 { 91 if err := func() error { 92 t := r.clock.NewTimer(backoff.Fast.Delay(attempt)) 93 defer t.Stop() 94 95 select { 96 case <-ctx.Done(): 97 return ctx.Err() 98 case <-t.Chan(): 99 return nil 100 } 101 }(); err != nil { 102 return nil, err 103 } 104 } 105 106 attempt++ 107 stream, err := r.stream(ctx) 108 switch { 109 case r.isRetriableError(err): 110 r.fireReconnectOnRetryableError(stream, err) 111 runtime.Gosched() 112 113 continue 114 case err != nil: 115 return nil, err 116 default: 117 // pass 118 } 119 120 res, err := stream.ReadMessageBatch(ctx, opts) 121 if r.isRetriableError(err) { 122 r.fireReconnectOnRetryableError(stream, err) 123 runtime.Gosched() 124 125 continue 126 } 127 128 return res, err 129 } 130 } 131 132 func (r *readerReconnector) Commit(ctx context.Context, commitRange commitRange) error { 133 stream, err := r.stream(ctx) 134 if err != nil { 135 return err 136 } 137 138 err = stream.Commit(ctx, commitRange) 139 r.fireReconnectOnRetryableError(stream, err) 140 141 return err 142 } 143 144 func (r *readerReconnector) CloseWithError(ctx context.Context, err error) error { 145 var closeErr error 146 r.closeOnce.Do(func() { 147 r.m.WithLock(func() { 148 r.closedErr = err 149 }) 150 151 closeErr = r.background.Close(ctx, err) 152 153 if r.streamVal != nil { 154 streamCloseErr := r.streamVal.CloseWithError(ctx, xerrors.WithStackTrace(errReaderClosed)) 155 if closeErr == nil { 156 closeErr = streamCloseErr 157 } 158 } 159 160 r.m.WithLock(func() { 161 if !r.initDone { 162 r.initErr = closeErr 163 close(r.initDoneCh) 164 } 165 }) 166 }) 167 168 return closeErr 169 } 170 171 func (r *readerReconnector) start() { 172 r.background.Start("reconnector-loop", r.reconnectionLoop) 173 174 // start first connection 175 r.reconnectFromBadStream <- newReconnectRequest(nil, nil) 176 } 177 178 func (r *readerReconnector) initChannelsAndClock() { 179 if r.clock == nil { 180 r.clock = clockwork.NewRealClock() 181 } 182 r.reconnectFromBadStream = make(chan reconnectRequest, 1) 183 r.streamConnectionInProgress = make(empty.Chan) 184 r.initDoneCh = make(empty.Chan) 185 close(r.streamConnectionInProgress) // no progress at start 186 } 187 188 func (r *readerReconnector) reconnectionLoop(ctx context.Context) { 189 defer r.handlePanic() 190 191 var retriesStarted time.Time 192 lastTime := time.Time{} 193 attempt := 0 194 for { 195 now := r.clock.Now() 196 if topic.CheckResetReconnectionCounters(lastTime, now, r.connectTimeout) { 197 attempt = 0 198 retriesStarted = time.Now() 199 } else { 200 attempt++ 201 } 202 lastTime = now 203 204 var request reconnectRequest 205 select { 206 case <-ctx.Done(): 207 return 208 209 case request = <-r.reconnectFromBadStream: 210 if retriesStarted.IsZero() { 211 retriesStarted = time.Now() 212 } 213 } 214 215 if request.reason != nil { 216 if retryBackoff, isRetriableErr := r.checkErrRetryMode( 217 request.reason, 218 r.clock.Since(retriesStarted), 219 ); isRetriableErr { 220 if err := func() error { 221 t := r.clock.NewTimer(retryBackoff.Delay(attempt)) 222 defer t.Stop() 223 224 select { 225 case <-ctx.Done(): 226 return ctx.Err() 227 case <-t.Chan(): 228 return nil 229 } 230 }(); err != nil { 231 return 232 } 233 } 234 } 235 236 _ = r.reconnect(ctx, request.reason, request.oldReader) 237 } 238 } 239 240 func (r *readerReconnector) reconnect(ctx context.Context, reason error, oldReader batchedStreamReader) (err error) { 241 onDone := trace.TopicOnReaderReconnect(r.tracer, reason) 242 defer func() { 243 onDone(err) 244 }() 245 246 if err = ctx.Err(); err != nil { 247 return err 248 } 249 250 var closedErr error 251 r.m.WithRLock(func() { 252 closedErr = r.closedErr 253 }) 254 if closedErr != nil { 255 return err 256 } 257 258 stream, _ := r.stream(ctx) 259 if oldReader != stream { 260 return xerrors.WithStackTrace(errReconnectRequestOutdated) 261 } 262 263 connectionInProgress := make(empty.Chan) 264 defer close(connectionInProgress) 265 266 r.m.WithLock(func() { 267 r.streamConnectionInProgress = connectionInProgress 268 }) 269 270 if oldReader != nil { 271 _ = oldReader.CloseWithError(ctx, xerrors.WithStackTrace(errReconnect)) 272 } 273 274 newStream, err := r.connectWithTimeout() 275 276 if r.isRetriableError(err) { 277 go func(reason error) { 278 // guarantee write reconnect signal to channel 279 r.reconnectFromBadStream <- newReconnectRequest(oldReader, reason) 280 trace.TopicOnReaderReconnectRequest(r.tracer, err, true) 281 }(err) 282 } 283 284 r.m.WithLock(func() { 285 r.streamErr = err 286 if err == nil { 287 r.streamVal = newStream 288 if !r.initDone { 289 r.initDone = true 290 close(r.initDoneCh) 291 } 292 } 293 }) 294 295 return err 296 } 297 298 func (r *readerReconnector) isRetriableError(err error) bool { 299 _, res := topic.CheckRetryMode(err, r.retrySettings, 0) 300 301 return res 302 } 303 304 func (r *readerReconnector) checkErrRetryMode(err error, retriesDuration time.Duration) ( 305 backoffType backoff.Backoff, 306 isRetriableErr bool, 307 ) { 308 return topic.CheckRetryMode(err, r.retrySettings, retriesDuration) 309 } 310 311 func (r *readerReconnector) connectWithTimeout() (_ batchedStreamReader, err error) { 312 bgContext := r.background.Context() 313 314 if err = bgContext.Err(); err != nil { 315 return nil, err 316 } 317 318 connectionContext, cancel := xcontext.WithCancel(context.Background()) 319 320 type connectResult struct { 321 stream batchedStreamReader 322 err error 323 } 324 result := make(chan connectResult, 1) 325 326 go func() { 327 stream, err := r.readerConnect(connectionContext) 328 result <- connectResult{stream: stream, err: err} 329 }() 330 331 var res connectResult 332 select { 333 case <-r.clock.After(r.connectTimeout): 334 // cancel connection context only if timeout exceed while connection 335 // because if cancel context after connect - it will break 336 cancel() 337 res = <-result 338 case res = <-result: 339 // pass 340 } 341 342 if res.err == nil { 343 return res.stream, nil 344 } 345 346 return nil, res.err 347 } 348 349 func (r *readerReconnector) WaitInit(ctx context.Context) error { 350 if ctx.Err() != nil { 351 return ctx.Err() 352 } 353 354 select { 355 case <-ctx.Done(): 356 return ctx.Err() 357 case <-r.initDoneCh: 358 return r.initErr 359 } 360 } 361 362 func (r *readerReconnector) fireReconnectOnRetryableError(stream batchedStreamReader, err error) { 363 if !r.isRetriableError(err) { 364 return 365 } 366 367 select { 368 case r.reconnectFromBadStream <- newReconnectRequest(stream, err): 369 // send signal 370 trace.TopicOnReaderReconnectRequest(r.tracer, err, true) 371 default: 372 // previous reconnect signal in process, no need sent signal more 373 trace.TopicOnReaderReconnectRequest(r.tracer, err, false) 374 } 375 } 376 377 func (r *readerReconnector) stream(ctx context.Context) (batchedStreamReader, error) { 378 if err := ctx.Err(); err != nil { 379 return nil, err 380 } 381 382 var err error 383 var connectionChan empty.Chan 384 r.m.WithRLock(func() { 385 connectionChan = r.streamConnectionInProgress 386 if r.closedErr != nil { 387 err = r.closedErr 388 389 return 390 } 391 }) 392 if err != nil { 393 return nil, err 394 } 395 396 select { 397 case <-ctx.Done(): 398 return nil, ctx.Err() 399 case <-r.background.Done(): 400 return nil, r.closedErr 401 case <-connectionChan: 402 var reader batchedStreamReader 403 r.m.WithRLock(func() { 404 reader = r.streamVal 405 err = r.streamErr 406 }) 407 r.fireReconnectOnRetryableError(reader, err) 408 409 return reader, err 410 } 411 } 412 413 func (r *readerReconnector) handlePanic() { 414 p := recover() 415 416 if p != nil { 417 _ = r.CloseWithError(context.Background(), xerrors.WithStackTrace(fmt.Errorf("handled panic: %v", p))) 418 } 419 } 420 421 type reconnectRequest struct { 422 oldReader batchedStreamReader 423 reason error 424 } 425 426 func newReconnectRequest(oldReader batchedStreamReader, reason error) reconnectRequest { 427 return reconnectRequest{ 428 oldReader: oldReader, 429 reason: reason, 430 } 431 }