github.com/ydb-platform/ydb-go-sdk/v3@v3.57.0/internal/topic/topicreaderinternal/stream_reconnector.go (about)

     1  package topicreaderinternal
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"runtime"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/jonboulle/clockwork"
    12  
    13  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/background"
    14  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/backoff"
    15  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/empty"
    16  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/topic"
    17  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/value"
    18  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/xcontext"
    19  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/xerrors"
    20  	"github.com/ydb-platform/ydb-go-sdk/v3/internal/xsync"
    21  	"github.com/ydb-platform/ydb-go-sdk/v3/trace"
    22  )
    23  
    // Sentinel errors of the reconnector.
    24  var (
        	// errReconnectRequestOutdated is returned by reconnect when the stream named
        	// in the request is not the current stream any more.
    25  	errReconnectRequestOutdated = xerrors.Wrap(errors.New("ydb: reconnect request outdated"))
        	// errReconnect is the reason passed to the old stream when it is closed
        	// before a new connection attempt.
    26  	errReconnect                = xerrors.Wrap(errors.New("ydb: reconnect to topic grpc stream"))
    27  )
    28  
    // readerConnectFunc opens a new batchedStreamReader bound to ctx.
    // The reconnector calls it once per connection attempt.
    29  type readerConnectFunc func(ctx context.Context) (batchedStreamReader, error)
    30  
    // readerReconnector wraps a batchedStreamReader and re-creates it through
    // readerConnect when an operation fails with a retriable error.
    // Reconnections are performed by a single background loop that is fed
    // through reconnectFromBadStream.
    31  type readerReconnector struct {
    32  	background                 background.Worker // runs reconnectionLoop; its Done channel unblocks stream()
    33  	clock                      clockwork.Clock // source of time, timers and timeouts
    34  	baseContext                context.Context // stored by the constructor; not read in the code visible here
    35  	retrySettings              topic.RetrySettings // passed to topic.CheckRetryMode
    36  	streamVal                  batchedStreamReader // current stream; replaced after a successful reconnect
    37  	streamErr                  error // result of the last connect attempt; errUnconnected before the first one
    38  	closedErr                  error // reason given to CloseWithError
    39  	initErr                    error // returned by WaitInit once initDoneCh is closed
    40  	tracer                     *trace.Topic
    41  	readerConnect              readerConnectFunc // opens a new stream
    42  	reconnectFromBadStream     chan reconnectRequest // pending reconnect requests, created with a buffer of 1
    43  	connectTimeout             time.Duration // limit for one connect attempt; the constructor replaces 0 with value.InfiniteDuration
    44  	readerID                   int64
    45  	streamConnectionInProgress empty.Chan // opened if connection in progress, closed if connection established
    46  	initDoneCh                 empty.Chan // closed after the first successful connect or by CloseWithError if that never happened
    47  	m                          xsync.RWMutex // taken around most accesses to streamVal, streamErr, closedErr, streamConnectionInProgress and initDone
    48  	closeOnce                  sync.Once // makes the body of CloseWithError run once
    49  	initDone                   bool // true once the first connection succeeded
    50  }
    51  
    52  //nolint:revive
    53  func newReaderReconnector(
    54  	readerID int64,
    55  	connector readerConnectFunc,
    56  	connectTimeout time.Duration,
    57  	retrySettings topic.RetrySettings,
    58  	tracer *trace.Topic,
    59  	baseContext context.Context,
    60  ) *readerReconnector {
    61  	res := &readerReconnector{
    62  		readerID:       readerID,
    63  		clock:          clockwork.NewRealClock(),
    64  		readerConnect:  connector,
    65  		streamErr:      errUnconnected,
    66  		connectTimeout: connectTimeout,
    67  		tracer:         tracer,
    68  		baseContext:    baseContext,
    69  		retrySettings:  retrySettings,
    70  	}
    71  
    72  	if res.connectTimeout == 0 {
    73  		res.connectTimeout = value.InfiniteDuration
    74  	}
    75  
    76  	res.initChannelsAndClock()
    77  	res.start()
    78  
    79  	return res
    80  }
    81  
    82  func (r *readerReconnector) ReadMessageBatch(ctx context.Context, opts ReadMessageBatchOptions) (*PublicBatch, error) {
    83  	if ctx.Err() != nil {
    84  		return nil, ctx.Err()
    85  	}
    86  
    87  	attempt := 0
    88  
    89  	for {
    90  		if attempt > 0 {
    91  			if err := func() error {
    92  				t := r.clock.NewTimer(backoff.Fast.Delay(attempt))
    93  				defer t.Stop()
    94  
    95  				select {
    96  				case <-ctx.Done():
    97  					return ctx.Err()
    98  				case <-t.Chan():
    99  					return nil
   100  				}
   101  			}(); err != nil {
   102  				return nil, err
   103  			}
   104  		}
   105  
   106  		attempt++
   107  		stream, err := r.stream(ctx)
   108  		switch {
   109  		case r.isRetriableError(err):
   110  			r.fireReconnectOnRetryableError(stream, err)
   111  			runtime.Gosched()
   112  
   113  			continue
   114  		case err != nil:
   115  			return nil, err
   116  		default:
   117  			// pass
   118  		}
   119  
   120  		res, err := stream.ReadMessageBatch(ctx, opts)
   121  		if r.isRetriableError(err) {
   122  			r.fireReconnectOnRetryableError(stream, err)
   123  			runtime.Gosched()
   124  
   125  			continue
   126  		}
   127  
   128  		return res, err
   129  	}
   130  }
   131  
   132  func (r *readerReconnector) Commit(ctx context.Context, commitRange commitRange) error {
   133  	stream, err := r.stream(ctx)
   134  	if err != nil {
   135  		return err
   136  	}
   137  
   138  	err = stream.Commit(ctx, commitRange)
   139  	r.fireReconnectOnRetryableError(stream, err)
   140  
   141  	return err
   142  }
   143  
   144  func (r *readerReconnector) CloseWithError(ctx context.Context, err error) error {
   145  	var closeErr error
   146  	r.closeOnce.Do(func() {
   147  		r.m.WithLock(func() {
   148  			r.closedErr = err
   149  		})
   150  
   151  		closeErr = r.background.Close(ctx, err)
   152  
   153  		if r.streamVal != nil {
   154  			streamCloseErr := r.streamVal.CloseWithError(ctx, xerrors.WithStackTrace(errReaderClosed))
   155  			if closeErr == nil {
   156  				closeErr = streamCloseErr
   157  			}
   158  		}
   159  
   160  		r.m.WithLock(func() {
   161  			if !r.initDone {
   162  				r.initErr = closeErr
   163  				close(r.initDoneCh)
   164  			}
   165  		})
   166  	})
   167  
   168  	return closeErr
   169  }
   170  
   171  func (r *readerReconnector) start() {
   172  	r.background.Start("reconnector-loop", r.reconnectionLoop)
   173  
   174  	// start first connection
   175  	r.reconnectFromBadStream <- newReconnectRequest(nil, nil)
   176  }
   177  
   178  func (r *readerReconnector) initChannelsAndClock() {
   179  	if r.clock == nil {
   180  		r.clock = clockwork.NewRealClock()
   181  	}
   182  	r.reconnectFromBadStream = make(chan reconnectRequest, 1)
   183  	r.streamConnectionInProgress = make(empty.Chan)
   184  	r.initDoneCh = make(empty.Chan)
   185  	close(r.streamConnectionInProgress) // no progress at start
   186  }
   187  
   188  func (r *readerReconnector) reconnectionLoop(ctx context.Context) {
   189  	defer r.handlePanic()
   190  
   191  	var retriesStarted time.Time
   192  	lastTime := time.Time{}
   193  	attempt := 0
   194  	for {
   195  		now := r.clock.Now()
   196  		if topic.CheckResetReconnectionCounters(lastTime, now, r.connectTimeout) {
   197  			attempt = 0
   198  			retriesStarted = time.Now()
   199  		} else {
   200  			attempt++
   201  		}
   202  		lastTime = now
   203  
   204  		var request reconnectRequest
   205  		select {
   206  		case <-ctx.Done():
   207  			return
   208  
   209  		case request = <-r.reconnectFromBadStream:
   210  			if retriesStarted.IsZero() {
   211  				retriesStarted = time.Now()
   212  			}
   213  		}
   214  
   215  		if request.reason != nil {
   216  			if retryBackoff, isRetriableErr := r.checkErrRetryMode(
   217  				request.reason,
   218  				r.clock.Since(retriesStarted),
   219  			); isRetriableErr {
   220  				if err := func() error {
   221  					t := r.clock.NewTimer(retryBackoff.Delay(attempt))
   222  					defer t.Stop()
   223  
   224  					select {
   225  					case <-ctx.Done():
   226  						return ctx.Err()
   227  					case <-t.Chan():
   228  						return nil
   229  					}
   230  				}(); err != nil {
   231  					return
   232  				}
   233  			}
   234  		}
   235  
   236  		_ = r.reconnect(ctx, request.reason, request.oldReader)
   237  	}
   238  }
   239  
   240  func (r *readerReconnector) reconnect(ctx context.Context, reason error, oldReader batchedStreamReader) (err error) {
   241  	onDone := trace.TopicOnReaderReconnect(r.tracer, reason)
   242  	defer func() {
   243  		onDone(err)
   244  	}()
   245  
   246  	if err = ctx.Err(); err != nil {
   247  		return err
   248  	}
   249  
   250  	var closedErr error
   251  	r.m.WithRLock(func() {
   252  		closedErr = r.closedErr
   253  	})
   254  	if closedErr != nil {
   255  		return err
   256  	}
   257  
   258  	stream, _ := r.stream(ctx)
   259  	if oldReader != stream {
   260  		return xerrors.WithStackTrace(errReconnectRequestOutdated)
   261  	}
   262  
   263  	connectionInProgress := make(empty.Chan)
   264  	defer close(connectionInProgress)
   265  
   266  	r.m.WithLock(func() {
   267  		r.streamConnectionInProgress = connectionInProgress
   268  	})
   269  
   270  	if oldReader != nil {
   271  		_ = oldReader.CloseWithError(ctx, xerrors.WithStackTrace(errReconnect))
   272  	}
   273  
   274  	newStream, err := r.connectWithTimeout()
   275  
   276  	if r.isRetriableError(err) {
   277  		go func(reason error) {
   278  			// guarantee write reconnect signal to channel
   279  			r.reconnectFromBadStream <- newReconnectRequest(oldReader, reason)
   280  			trace.TopicOnReaderReconnectRequest(r.tracer, err, true)
   281  		}(err)
   282  	}
   283  
   284  	r.m.WithLock(func() {
   285  		r.streamErr = err
   286  		if err == nil {
   287  			r.streamVal = newStream
   288  			if !r.initDone {
   289  				r.initDone = true
   290  				close(r.initDoneCh)
   291  			}
   292  		}
   293  	})
   294  
   295  	return err
   296  }
   297  
   298  func (r *readerReconnector) isRetriableError(err error) bool {
   299  	_, res := topic.CheckRetryMode(err, r.retrySettings, 0)
   300  
   301  	return res
   302  }
   303  
   304  func (r *readerReconnector) checkErrRetryMode(err error, retriesDuration time.Duration) (
   305  	backoffType backoff.Backoff,
   306  	isRetriableErr bool,
   307  ) {
   308  	return topic.CheckRetryMode(err, r.retrySettings, retriesDuration)
   309  }
   310  
   311  func (r *readerReconnector) connectWithTimeout() (_ batchedStreamReader, err error) {
   312  	bgContext := r.background.Context()
   313  
   314  	if err = bgContext.Err(); err != nil {
   315  		return nil, err
   316  	}
   317  
   318  	connectionContext, cancel := xcontext.WithCancel(context.Background())
   319  
   320  	type connectResult struct {
   321  		stream batchedStreamReader
   322  		err    error
   323  	}
   324  	result := make(chan connectResult, 1)
   325  
   326  	go func() {
   327  		stream, err := r.readerConnect(connectionContext)
   328  		result <- connectResult{stream: stream, err: err}
   329  	}()
   330  
   331  	var res connectResult
   332  	select {
   333  	case <-r.clock.After(r.connectTimeout):
   334  		// cancel connection context only if timeout exceed while connection
   335  		// because if cancel context after connect - it will break
   336  		cancel()
   337  		res = <-result
   338  	case res = <-result:
   339  		// pass
   340  	}
   341  
   342  	if res.err == nil {
   343  		return res.stream, nil
   344  	}
   345  
   346  	return nil, res.err
   347  }
   348  
   349  func (r *readerReconnector) WaitInit(ctx context.Context) error {
   350  	if ctx.Err() != nil {
   351  		return ctx.Err()
   352  	}
   353  
   354  	select {
   355  	case <-ctx.Done():
   356  		return ctx.Err()
   357  	case <-r.initDoneCh:
   358  		return r.initErr
   359  	}
   360  }
   361  
   362  func (r *readerReconnector) fireReconnectOnRetryableError(stream batchedStreamReader, err error) {
   363  	if !r.isRetriableError(err) {
   364  		return
   365  	}
   366  
   367  	select {
   368  	case r.reconnectFromBadStream <- newReconnectRequest(stream, err):
   369  		// send signal
   370  		trace.TopicOnReaderReconnectRequest(r.tracer, err, true)
   371  	default:
   372  		// previous reconnect signal in process, no need sent signal more
   373  		trace.TopicOnReaderReconnectRequest(r.tracer, err, false)
   374  	}
   375  }
   376  
   377  func (r *readerReconnector) stream(ctx context.Context) (batchedStreamReader, error) {
   378  	if err := ctx.Err(); err != nil {
   379  		return nil, err
   380  	}
   381  
   382  	var err error
   383  	var connectionChan empty.Chan
   384  	r.m.WithRLock(func() {
   385  		connectionChan = r.streamConnectionInProgress
   386  		if r.closedErr != nil {
   387  			err = r.closedErr
   388  
   389  			return
   390  		}
   391  	})
   392  	if err != nil {
   393  		return nil, err
   394  	}
   395  
   396  	select {
   397  	case <-ctx.Done():
   398  		return nil, ctx.Err()
   399  	case <-r.background.Done():
   400  		return nil, r.closedErr
   401  	case <-connectionChan:
   402  		var reader batchedStreamReader
   403  		r.m.WithRLock(func() {
   404  			reader = r.streamVal
   405  			err = r.streamErr
   406  		})
   407  		r.fireReconnectOnRetryableError(reader, err)
   408  
   409  		return reader, err
   410  	}
   411  }
   412  
   413  func (r *readerReconnector) handlePanic() {
   414  	p := recover()
   415  
   416  	if p != nil {
   417  		_ = r.CloseWithError(context.Background(), xerrors.WithStackTrace(fmt.Errorf("handled panic: %v", p)))
   418  	}
   419  }
   420  
   // reconnectRequest is a message for the reconnection loop.
   421  type reconnectRequest struct {
   422  	oldReader batchedStreamReader // stream to replace; nil for the initial connection. A request is ignored if it is not the current stream
   423  	reason    error // error that caused the request; nil means no backoff wait before connecting
   424  }
   425  
   426  func newReconnectRequest(oldReader batchedStreamReader, reason error) reconnectRequest {
   427  	return reconnectRequest{
   428  		oldReader: oldReader,
   429  		reason:    reason,
   430  	}
   431  }