github.com/ydb-platform/ydb-go-sdk/v3@v3.89.2/internal/topic/topicreaderinternal/stream_reconnector.go (about)

     1  package topicreaderinternal
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"runtime"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/jonboulle/clockwork"
    12  
    13  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/background"
    14  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/backoff"
    15  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/empty"
    16  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/topic"
    17  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/topic/topicreadercommon"
    18  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/tx"
    19  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/value"
    20  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/xcontext"
    21  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/xerrors"
    22  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/xsync"
    23  	"github.com/ydb-platform/ydb-go-sdk/v3/trace"
    24  )
    25  
// Sentinel errors used by the reader reconnector.
var (
	// errReconnectRequestOutdated is returned by reconnect when the reader named in
	// the request is not the stream the reconnector currently holds.
	errReconnectRequestOutdated = xerrors.Wrap(errors.New("ydb: reconnect request outdated"))
	// errReconnect is the reason given to the old stream when it is closed before reconnecting.
	errReconnect                = xerrors.Wrap(errors.New("ydb: reconnect to topic grpc stream"))
	// errConnectionTimeout is the cancellation cause of the connection context
	// when the connect timeout timer fires.
	errConnectionTimeout        = xerrors.Wrap(errors.New("ydb: topic reader connection timeout for stream"))
)
    31  
// readerConnectFunc opens a new stream reader; the reconnector calls it for every connection attempt.
type readerConnectFunc func(ctx context.Context) (batchedStreamReader, error)
    33  
// readerReconnector wraps a batchedStreamReader and replaces it with a newly
// connected stream when a retriable error is reported.
type readerReconnector struct {
	// background runs the reconnection loop and helper tasks; its close reason
	// marks the reconnector as closed.
	background                 background.Worker
	// clock is the time source for timers and backoff waits.
	clock                      clockwork.Clock
	// retrySettings is passed to topic.RetryDecision to classify errors.
	retrySettings              topic.RetrySettings
	// streamVal is the current stream; reconnect replaces it under m.
	streamVal                  batchedStreamReader
	// streamContextCancel cancels the context the current stream was connected with.
	streamContextCancel        context.CancelCauseFunc
	// streamErr is the result of the last connection attempt; the constructor
	// initializes it with errUnconnected.
	streamErr                  error
	// initErr is the close reason stored when the reconnector is closed before
	// the first successful connection; WaitInit returns it.
	initErr                    error
	tracer                     *trace.Topic
	// readerConnect opens a new stream.
	readerConnect              readerConnectFunc
	// reconnectFromBadStream queues reconnect requests for the reconnection loop.
	reconnectFromBadStream     chan reconnectRequest
	// connectTimeout limits a single connection attempt; the constructor replaces
	// zero with value.InfiniteDuration.
	connectTimeout             time.Duration
	readerID                   int64
	streamConnectionInProgress empty.Chan // opened if connection in progress, closed if connection established
	// initDoneCh is closed after the first successful connection or when the
	// reconnector is closed before it.
	initDoneCh                 empty.Chan
	m                          xsync.RWMutex
	// closeOnce makes CloseWithError run its body only once.
	closeOnce                  sync.Once
	// initDone is set after the first successful connection.
	initDone                   bool
}
    53  
    54  func newReaderReconnector(
    55  	readerID int64,
    56  	connector readerConnectFunc,
    57  	connectTimeout time.Duration,
    58  	retrySettings topic.RetrySettings,
    59  	tracer *trace.Topic,
    60  ) *readerReconnector {
    61  	res := &readerReconnector{
    62  		readerID:       readerID,
    63  		clock:          clockwork.NewRealClock(),
    64  		readerConnect:  connector,
    65  		streamErr:      errUnconnected,
    66  		connectTimeout: connectTimeout,
    67  		tracer:         tracer,
    68  		retrySettings:  retrySettings,
    69  	}
    70  
    71  	if res.connectTimeout == 0 {
    72  		res.connectTimeout = value.InfiniteDuration
    73  	}
    74  
    75  	res.initChannelsAndClock()
    76  	res.start()
    77  
    78  	return res
    79  }
    80  
    81  func (r *readerReconnector) PopMessagesBatchTx(
    82  	ctx context.Context,
    83  	tx tx.Transaction,
    84  	opts ReadMessageBatchOptions,
    85  ) (
    86  	*topicreadercommon.PublicBatch,
    87  	error,
    88  ) {
    89  	return r.readWithReconnections(
    90  		ctx,
    91  		func(
    92  			ctx context.Context,
    93  			stream batchedStreamReader,
    94  		) (
    95  			*topicreadercommon.PublicBatch,
    96  			error,
    97  		) {
    98  			return stream.PopMessagesBatchTx(ctx, tx, opts)
    99  		},
   100  	)
   101  }
   102  
   103  func (r *readerReconnector) ReadMessageBatch(
   104  	ctx context.Context,
   105  	opts ReadMessageBatchOptions,
   106  ) (
   107  	*topicreadercommon.PublicBatch,
   108  	error,
   109  ) {
   110  	return r.readWithReconnections(
   111  		ctx,
   112  		func(
   113  			ctx context.Context,
   114  			stream batchedStreamReader,
   115  		) (
   116  			*topicreadercommon.PublicBatch,
   117  			error,
   118  		) {
   119  			return stream.ReadMessageBatch(ctx, opts)
   120  		},
   121  	)
   122  }
   123  
   124  func (r *readerReconnector) readWithReconnections(
   125  	ctx context.Context,
   126  	read func(
   127  		ctx context.Context,
   128  		stream batchedStreamReader,
   129  	) (*topicreadercommon.PublicBatch, error),
   130  ) (
   131  	*topicreadercommon.PublicBatch,
   132  	error,
   133  ) {
   134  	if ctx.Err() != nil {
   135  		return nil, ctx.Err()
   136  	}
   137  
   138  	attempt := 0
   139  
   140  	for {
   141  		if attempt > 0 {
   142  			if err := func() error {
   143  				t := r.clock.NewTimer(backoff.Fast.Delay(attempt))
   144  				defer t.Stop()
   145  
   146  				select {
   147  				case <-ctx.Done():
   148  					return ctx.Err()
   149  				case <-t.Chan():
   150  					return nil
   151  				}
   152  			}(); err != nil {
   153  				return nil, err
   154  			}
   155  		}
   156  
   157  		attempt++
   158  		stream, err := r.stream(ctx)
   159  		switch {
   160  		case r.isRetriableError(err):
   161  			r.fireReconnectOnRetryableError(stream, err)
   162  			runtime.Gosched()
   163  
   164  			continue
   165  		case err != nil:
   166  			return nil, err
   167  		default:
   168  			// pass
   169  		}
   170  
   171  		res, err := read(ctx, stream)
   172  		if r.isRetriableError(err) {
   173  			r.fireReconnectOnRetryableError(stream, err)
   174  			runtime.Gosched()
   175  
   176  			continue
   177  		}
   178  
   179  		return res, err
   180  	}
   181  }
   182  
   183  func (r *readerReconnector) Commit(
   184  	ctx context.Context,
   185  	commitRange topicreadercommon.CommitRange,
   186  ) error {
   187  	stream, err := r.stream(ctx)
   188  	if err != nil {
   189  		return err
   190  	}
   191  
   192  	err = stream.Commit(ctx, commitRange)
   193  	r.fireReconnectOnRetryableError(stream, err)
   194  
   195  	return err
   196  }
   197  
   198  func (r *readerReconnector) CloseWithError(ctx context.Context, reason error) error {
   199  	var closeErr error
   200  	r.closeOnce.Do(func() {
   201  		closeErr = r.background.Close(ctx, reason)
   202  
   203  		if r.streamVal != nil {
   204  			streamCloseErr := r.streamVal.CloseWithError(ctx, xerrors.WithStackTrace(errReaderClosed))
   205  			r.streamContextCancel(errReaderClosed)
   206  			if closeErr == nil {
   207  				closeErr = streamCloseErr
   208  			}
   209  		}
   210  
   211  		r.m.WithLock(func() {
   212  			if !r.initDone {
   213  				r.initErr = reason
   214  				close(r.initDoneCh)
   215  			}
   216  		})
   217  	})
   218  
   219  	return closeErr
   220  }
   221  
   222  func (r *readerReconnector) start() {
   223  	r.background.Start("reconnector-loop", r.reconnectionLoop)
   224  
   225  	// start first connection
   226  	r.reconnectFromBadStream <- newReconnectRequest(nil, nil)
   227  }
   228  
   229  func (r *readerReconnector) initChannelsAndClock() {
   230  	if r.clock == nil {
   231  		r.clock = clockwork.NewRealClock()
   232  	}
   233  	r.reconnectFromBadStream = make(chan reconnectRequest, 1)
   234  	r.streamConnectionInProgress = make(empty.Chan)
   235  	r.initDoneCh = make(empty.Chan)
   236  	close(r.streamConnectionInProgress) // no progress at start
   237  }
   238  
   239  func (r *readerReconnector) reconnectionLoop(ctx context.Context) {
   240  	defer r.handlePanic()
   241  
   242  	var retriesStarted time.Time
   243  	lastTime := time.Time{}
   244  	attempt := 0
   245  	for {
   246  		now := r.clock.Now()
   247  		if topic.CheckResetReconnectionCounters(lastTime, now, r.connectTimeout) {
   248  			attempt = 0
   249  			retriesStarted = time.Now()
   250  		} else {
   251  			attempt++
   252  		}
   253  		lastTime = now
   254  
   255  		var request reconnectRequest
   256  		select {
   257  		case <-ctx.Done():
   258  			return
   259  
   260  		case request = <-r.reconnectFromBadStream:
   261  			if retriesStarted.IsZero() {
   262  				retriesStarted = time.Now()
   263  			}
   264  		}
   265  
   266  		onReconnectionDone := trace.TopicOnReaderReconnect(r.tracer, request.reason)
   267  
   268  		if request.reason != nil {
   269  			retryBackoff, stopRetryReason := r.checkErrRetryMode(
   270  				request.reason,
   271  				r.clock.Since(retriesStarted),
   272  			)
   273  			if stopRetryReason == nil {
   274  				if err := func() error {
   275  					t := r.clock.NewTimer(retryBackoff.Delay(attempt))
   276  					defer t.Stop()
   277  
   278  					select {
   279  					case <-ctx.Done():
   280  						return ctx.Err()
   281  					case <-t.Chan():
   282  						return nil
   283  					}
   284  				}(); err != nil {
   285  					return
   286  				}
   287  			} else {
   288  				_ = r.CloseWithError(ctx, stopRetryReason)
   289  				onReconnectionDone(stopRetryReason)
   290  
   291  				return
   292  			}
   293  		}
   294  
   295  		err := r.reconnect(ctx, request.reason, request.oldReader)
   296  		onReconnectionDone(err)
   297  	}
   298  }
   299  
   300  //nolint:funlen
   301  func (r *readerReconnector) reconnect(ctx context.Context, reason error, oldReader batchedStreamReader) (err error) {
   302  	onDone := trace.TopicOnReaderReconnect(r.tracer, reason)
   303  	defer func() { onDone(err) }()
   304  
   305  	if err = ctx.Err(); err != nil {
   306  		return err
   307  	}
   308  
   309  	var closedErr error
   310  	r.m.WithRLock(func() {
   311  		closedErr = r.background.CloseReason()
   312  	})
   313  	if closedErr != nil {
   314  		return err
   315  	}
   316  
   317  	if stream, _ := r.stream(ctx); oldReader != stream {
   318  		return xerrors.WithStackTrace(errReconnectRequestOutdated)
   319  	}
   320  
   321  	connectionInProgress := make(empty.Chan)
   322  	defer close(connectionInProgress)
   323  
   324  	r.m.WithLock(func() {
   325  		r.streamConnectionInProgress = connectionInProgress
   326  	})
   327  
   328  	if oldReader != nil {
   329  		_ = oldReader.CloseWithError(ctx, xerrors.WithStackTrace(errReconnect))
   330  	}
   331  
   332  	newStream, newStreamClose, err := r.connectWithTimeout()
   333  
   334  	switch {
   335  	case err == nil:
   336  		// pass
   337  	case r.isRetriableError(err):
   338  		sendReason := err
   339  		r.background.Start("ydb topic reader send reconnect message", func(ctx context.Context) {
   340  			select {
   341  			case r.reconnectFromBadStream <- newReconnectRequest(oldReader, sendReason):
   342  				trace.TopicOnReaderReconnectRequest(r.tracer, err, true)
   343  			case <-ctx.Done():
   344  				trace.TopicOnReaderReconnectRequest(r.tracer, ctx.Err(), false)
   345  			}
   346  		})
   347  	default:
   348  		// unretriable error
   349  		_ = r.CloseWithError(ctx, err)
   350  	}
   351  
   352  	r.m.WithLock(func() {
   353  		r.streamErr = err
   354  		if err == nil {
   355  			r.streamVal = newStream
   356  			r.streamContextCancel = newStreamClose
   357  			if !r.initDone {
   358  				r.initDone = true
   359  				close(r.initDoneCh)
   360  			}
   361  		}
   362  	})
   363  
   364  	return err
   365  }
   366  
   367  func (r *readerReconnector) isRetriableError(err error) bool {
   368  	_, stopReason := topic.RetryDecision(err, r.retrySettings, 0)
   369  
   370  	return stopReason == nil
   371  }
   372  
   373  func (r *readerReconnector) checkErrRetryMode(err error, retriesDuration time.Duration) (
   374  	backoffType backoff.Backoff,
   375  	stopRetryReason error,
   376  ) {
   377  	return topic.RetryDecision(err, r.retrySettings, retriesDuration)
   378  }
   379  
   380  func (r *readerReconnector) connectWithTimeout() (_ batchedStreamReader, _ context.CancelCauseFunc, err error) {
   381  	bgContext := r.background.Context()
   382  
   383  	if err = bgContext.Err(); err != nil {
   384  		return nil, nil, err
   385  	}
   386  
   387  	connectionContext, cancel := context.WithCancelCause(xcontext.ValueOnly(bgContext))
   388  
   389  	type connectResult struct {
   390  		stream batchedStreamReader
   391  		err    error
   392  	}
   393  	result := make(chan connectResult, 1)
   394  
   395  	go func() {
   396  		stream, err := r.readerConnect(connectionContext)
   397  		result <- connectResult{stream: stream, err: err}
   398  	}()
   399  
   400  	connectionTimoutTimer := r.clock.NewTimer(r.connectTimeout)
   401  	defer connectionTimoutTimer.Stop()
   402  
   403  	var res connectResult
   404  	select {
   405  	case <-connectionTimoutTimer.Chan():
   406  		// cancel connection context only if timeout exceed while connection
   407  		// because if cancel context after connect - it will break
   408  		cancel(xerrors.WithStackTrace(errConnectionTimeout))
   409  		res = <-result
   410  	case res = <-result:
   411  		// pass
   412  	}
   413  
   414  	if res.err == nil {
   415  		return res.stream, cancel, nil
   416  	}
   417  
   418  	return nil, nil, res.err
   419  }
   420  
   421  func (r *readerReconnector) WaitInit(ctx context.Context) error {
   422  	if ctx.Err() != nil {
   423  		return ctx.Err()
   424  	}
   425  
   426  	select {
   427  	case <-ctx.Done():
   428  		return ctx.Err()
   429  	case <-r.initDoneCh:
   430  		return r.initErr
   431  	case <-r.background.Done():
   432  		return r.background.CloseReason()
   433  	}
   434  }
   435  
   436  func (r *readerReconnector) fireReconnectOnRetryableError(stream batchedStreamReader, err error) {
   437  	if !r.isRetriableError(err) {
   438  		return
   439  	}
   440  
   441  	select {
   442  	case r.reconnectFromBadStream <- newReconnectRequest(stream, err):
   443  		// send signal
   444  		trace.TopicOnReaderReconnectRequest(r.tracer, err, true)
   445  	default:
   446  		// previous reconnect signal in process, no need sent signal more
   447  		trace.TopicOnReaderReconnectRequest(r.tracer, err, false)
   448  	}
   449  }
   450  
   451  func (r *readerReconnector) stream(ctx context.Context) (batchedStreamReader, error) {
   452  	if err := ctx.Err(); err != nil {
   453  		return nil, err
   454  	}
   455  
   456  	var err error
   457  	var connectionChan empty.Chan
   458  	r.m.WithRLock(func() {
   459  		connectionChan = r.streamConnectionInProgress
   460  		err = r.background.CloseReason()
   461  	})
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  
   466  	select {
   467  	case <-ctx.Done():
   468  		return nil, ctx.Err()
   469  	case <-r.background.Done():
   470  		return nil, r.background.CloseReason()
   471  	case <-connectionChan:
   472  		var reader batchedStreamReader
   473  		r.m.WithRLock(func() {
   474  			reader = r.streamVal
   475  			err = r.streamErr
   476  		})
   477  		r.fireReconnectOnRetryableError(reader, err)
   478  
   479  		return reader, err
   480  	}
   481  }
   482  
   483  func (r *readerReconnector) handlePanic() {
   484  	if p := recover(); p != nil {
   485  		_ = r.CloseWithError(context.Background(), xerrors.WithStackTrace(fmt.Errorf("handled panic: %v", p)))
   486  	}
   487  }
   488  
// reconnectRequest asks the reconnection loop to replace a stream.
type reconnectRequest struct {
	// oldReader is the stream the request was created for; reconnect rejects the
	// request when it differs from the current stream. It is nil for the first connection.
	oldReader batchedStreamReader
	// reason is the error that caused the request; nil means reconnect without backoff.
	reason    error
}
   493  
   494  func newReconnectRequest(oldReader batchedStreamReader, reason error) reconnectRequest {
   495  	return reconnectRequest{
   496  		oldReader: oldReader,
   497  		reason:    reason,
   498  	}
   499  }