github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/retry.go (about)

     1  package chat
     2  
     3  import (
     4  	"encoding/hex"
     5  	"errors"
     6  	"fmt"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/keybase/client/go/chat/globals"
    11  	"github.com/keybase/client/go/chat/types"
    12  	"github.com/keybase/client/go/chat/utils"
    13  	"github.com/keybase/client/go/protocol/chat1"
    14  	"github.com/keybase/client/go/protocol/gregor1"
    15  	"github.com/keybase/clockwork"
    16  	"github.com/keybase/go-codec/codec"
    17  	context "golang.org/x/net/context"
    18  )
    19  
    20  type FetchType int
    21  
    22  const (
    23  	InboxLoad FetchType = iota
    24  	ThreadLoad
    25  	FullInboxLoad
    26  )
    27  
    28  const fetchInitialInterval = 3 * time.Second
    29  const fetchMultiplier = 1.5
    30  const fetchMaxAttempts = 100
    31  
// ConversationRetry describes a retry of a failed fetch for a single
// conversation. Depending on kind, fixing it re-runs either a thread pull or
// an inbox read for convID (see Fix).
type ConversationRetry struct {
	globals.Contextified
	utils.DebugLabeler

	// convID is the conversation whose fetch failed.
	convID chat1.ConversationID
	// tlfID may be nil; when set, RekeyFixable compares it against the TLF
	// that was rekeyed.
	tlfID *chat1.TLFID
	// kind selects which fix path Fix takes.
	kind FetchType
}

// Compile-time check that *ConversationRetry implements types.RetryDescription.
var _ types.RetryDescription = (*ConversationRetry)(nil)
    42  
    43  func NewConversationRetry(g *globals.Context, convID chat1.ConversationID, tlfID *chat1.TLFID, kind FetchType) *ConversationRetry {
    44  	dstr := fmt.Sprintf("ConversationRetry(%s,%v)", convID, kind)
    45  	return &ConversationRetry{
    46  		Contextified: globals.NewContextified(g),
    47  		DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), dstr, false),
    48  		convID:       convID,
    49  		tlfID:        tlfID,
    50  		kind:         kind,
    51  	}
    52  }
    53  
    54  func (c *ConversationRetry) String() string {
    55  	return fmt.Sprintf("%s:%v", c.convID, c.kind)
    56  }
    57  
    58  func (c *ConversationRetry) RekeyFixable(ctx context.Context, tlfID chat1.TLFID) bool {
    59  	return c.tlfID != nil && c.tlfID.Eq(tlfID)
    60  }
    61  
    62  func (c *ConversationRetry) SendStale(ctx context.Context, uid gregor1.UID) {
    63  	supdates := []chat1.ConversationStaleUpdate{{
    64  		ConvID:     c.convID,
    65  		UpdateType: chat1.StaleUpdateType_NEWACTIVITY,
    66  	}}
    67  	c.G().Syncer.SendChatStaleNotifications(ctx, uid, supdates, false)
    68  }
    69  
    70  func (c *ConversationRetry) Fix(ctx context.Context, uid gregor1.UID) error {
    71  	if c.kind == ThreadLoad {
    72  		return c.fixThreadFetch(ctx, uid)
    73  	}
    74  	return c.fixInboxFetch(ctx, uid)
    75  }
    76  
    77  func (c *ConversationRetry) fixInboxFetch(ctx context.Context, uid gregor1.UID) error {
    78  	c.Debug(ctx, "fixInboxFetch: retrying conversation")
    79  
    80  	// Reload this conversation and hope it works
    81  	inbox, _, err := c.G().InboxSource.Read(ctx, uid, types.ConversationLocalizerBlocking,
    82  		types.InboxSourceDataSourceAll, nil,
    83  		&chat1.GetInboxLocalQuery{
    84  			ConvIDs: []chat1.ConversationID{c.convID},
    85  		})
    86  	if err != nil {
    87  		c.Debug(ctx, "fixInboxFetch: failed to read inbox: msg: %s", err.Error())
    88  		return err
    89  	}
    90  	if len(inbox.Convs) != 1 {
    91  		c.Debug(ctx, "fixInboxFetch: unusual number of results for Read call: len: %d", len(inbox.Convs))
    92  		return errors.New("inbox fetch failed: unusual number of conversation returned")
    93  	}
    94  	conv := inbox.Convs[0]
    95  
    96  	if conv.Error == nil {
    97  		c.Debug(ctx, "fixInboxFetch: fixed convID: %s", conv.GetConvID())
    98  		return nil
    99  	}
   100  	c.Debug(ctx, "fixInboxFetch: convID failed again: msg: %s typ: %v",
   101  		conv.Error.Message, conv.Error.Typ)
   102  
   103  	return fmt.Errorf("inbox fetch failed: %s", conv.Error.Message)
   104  }
   105  
   106  func (c *ConversationRetry) fixThreadFetch(ctx context.Context, uid gregor1.UID) error {
   107  	c.Debug(ctx, "fixThreadFetch: retrying conversation")
   108  	// Attempt a pull of 50 messages to simulate whatever request got the
   109  	// conversation in this queue.
   110  	_, err := c.G().ConvSource.Pull(ctx, c.convID, uid, chat1.GetThreadReason_FIXRETRY, nil, nil,
   111  		&chat1.Pagination{
   112  			Num: 50,
   113  		})
   114  	if err == nil {
   115  		c.Debug(ctx, "fixThreadFetch: fixed")
   116  		return nil
   117  	}
   118  
   119  	c.Debug(ctx, "fixThreadFetch: convID failed again: msg: %s", err.Error())
   120  	return err
   121  }
   122  
// FullInboxRetry describes a retry of a failed full inbox load for a given
// local inbox query.
type FullInboxRetry struct {
	globals.Contextified
	utils.DebugLabeler

	// query is the local inbox query to re-run; it may be nil (String
	// handles the nil case explicitly).
	query *chat1.GetInboxLocalQuery
}

// Compile-time check that *FullInboxRetry implements types.RetryDescription.
var _ types.RetryDescription = (*FullInboxRetry)(nil)
   131  
   132  func NewFullInboxRetry(g *globals.Context, query *chat1.GetInboxLocalQuery) FullInboxRetry {
   133  	return FullInboxRetry{
   134  		Contextified: globals.NewContextified(g),
   135  		DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "FullInboxRetry", false),
   136  		query:        query,
   137  	}
   138  }
   139  
   140  func (f FullInboxRetry) String() string {
   141  	qstr := "<empty>"
   142  	if f.query != nil {
   143  		mh := codec.MsgpackHandle{WriteExt: true}
   144  		var data []byte
   145  		enc := codec.NewEncoderBytes(&data, &mh)
   146  		err := enc.Encode(*f.query)
   147  		if err != nil {
   148  			panic(err)
   149  		}
   150  		qstr = hex.EncodeToString(data)
   151  	}
   152  	pstr := "<empty>"
   153  	return qstr + pstr
   154  }
   155  
// RekeyFixable always reports false: a full inbox retry is never selected by
// FetchRetrier.Rekey, whatever TLF was rekeyed.
func (f FullInboxRetry) RekeyFixable(ctx context.Context, tlfID chat1.TLFID) bool {
	return false
}
   159  
   160  func (f FullInboxRetry) SendStale(ctx context.Context, uid gregor1.UID) {
   161  	f.G().Syncer.SendChatStaleNotifications(ctx, uid, nil, true)
   162  }
   163  
   164  func (f FullInboxRetry) Fix(ctx context.Context, uid gregor1.UID) error {
   165  	query, _, err := f.G().InboxSource.GetInboxQueryLocalToRemote(ctx, f.query)
   166  	if err != nil {
   167  		f.Debug(ctx, "Fix: failed to convert query: %s", err.Error())
   168  		return err
   169  	}
   170  	_, err = f.G().InboxSource.ReadUnverified(ctx, uid, types.InboxSourceDataSourceAll, query)
   171  	if err != nil {
   172  		f.Debug(ctx, "Fix: failed to load again: %d", err.Error())
   173  	}
   174  	return nil
   175  }
   176  
// retrierControl is the handle used to signal the goroutine started by
// FetchRetrier.spawnRetrier for one retry description.
type retrierControl struct {
	// desc is the retry description this control belongs to.
	desc types.RetryDescription
	// forceCh requests an immediate retry attempt.
	forceCh chan struct{}
	// shutdownCh requests that the retrier goroutine exit.
	shutdownCh chan struct{}
}
   182  
   183  func newRetrierControl(desc types.RetryDescription) *retrierControl {
   184  	return &retrierControl{
   185  		desc:       desc,
   186  		forceCh:    make(chan struct{}, 1),
   187  		shutdownCh: make(chan struct{}, 1),
   188  	}
   189  }
   190  
   191  func (c *retrierControl) Shutdown() {
   192  	select {
   193  	case c.shutdownCh <- struct{}{}:
   194  	default:
   195  	}
   196  }
   197  
   198  func (c *retrierControl) Force() {
   199  	select {
   200  	case c.forceCh <- struct{}{}:
   201  	default:
   202  	}
   203  }
   204  
// FetchRetrier is responsible for tracking any nonblock fetch failures, and retrying
// them automatically. Each tracked failure gets its own goroutine (see
// spawnRetrier) and a retrierControl registered in retriers.
type FetchRetrier struct {
	globals.Contextified
	utils.DebugLabeler
	// The embedded mutex guards retriers, offline and running.
	sync.Mutex

	// retriers maps a key built from the UID and the retry description (see
	// key) to the control of the goroutine retrying it.
	retriers map[string]*retrierControl
	// clock supplies the current time and timers; replaceable via SetClock.
	clock clockwork.Clock
	// offline is set by Disconnected and cleared by Connected; running is set
	// by Start and cleared by Stop.
	offline, running bool
}

// Compile-time check that *FetchRetrier implements types.FetchRetrier.
var _ types.FetchRetrier = (*FetchRetrier)(nil)
   218  
   219  func NewFetchRetrier(g *globals.Context) *FetchRetrier {
   220  	f := &FetchRetrier{
   221  		Contextified: globals.NewContextified(g),
   222  		DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "FetchRetrier", false),
   223  		clock:        clockwork.NewRealClock(),
   224  		retriers:     make(map[string]*retrierControl),
   225  	}
   226  	return f
   227  }
   228  
// SetClock sets a custom clock for testing.
//
// NOTE(review): the assignment is not made under the mutex, while retrier
// goroutines read f.clock; presumably this is meant to be called before any
// retrier is spawned -- confirm against the tests.
func (f *FetchRetrier) SetClock(clock clockwork.Clock) {
	f.clock = clock
}
   233  
// key builds the retriers map key for a retry: the UID and the description,
// both formatted with %s and joined by a colon.
func (f *FetchRetrier) key(uid gregor1.UID, desc types.RetryDescription) string {
	return fmt.Sprintf("%s:%s", uid, desc)
}
   237  
   238  // nextAttemptTime calculates the next try for a given retry item. It uses an exponential
   239  // decay calculation.
   240  func (f *FetchRetrier) nextAttemptTime(attempts int, lastAttempt time.Time) time.Time {
   241  	wait := time.Duration(float64(attempts) * fetchMultiplier * float64(fetchInitialInterval))
   242  	return lastAttempt.Add(wait)
   243  }
   244  
   245  func (f *FetchRetrier) spawnRetrier(ctx context.Context, uid gregor1.UID, desc types.RetryDescription,
   246  	control *retrierControl) {
   247  
   248  	attempts := 1
   249  	nextTime := f.nextAttemptTime(attempts, f.clock.Now())
   250  	ctx = globals.BackgroundChatCtx(ctx, f.G())
   251  	go func() {
   252  		for {
   253  			select {
   254  			case <-f.clock.AfterTime(nextTime):
   255  				// Only attempts if we are online. Otherwise just retry
   256  				// at the same interval that we used last time.
   257  				if !f.offline {
   258  					f.Debug(ctx, "spawnRetrier: retrying after time: desc: %s", desc)
   259  					if err := desc.Fix(ctx, uid); err == nil {
   260  						f.Lock()
   261  						delete(f.retriers, f.key(uid, desc))
   262  						f.Unlock()
   263  						desc.SendStale(ctx, uid)
   264  						return
   265  					}
   266  				}
   267  			case <-control.forceCh:
   268  				f.Debug(ctx, "spawnRetrier: retrying (forced): desc: %s", desc)
   269  				if err := desc.Fix(ctx, uid); err == nil {
   270  					f.Lock()
   271  					delete(f.retriers, f.key(uid, desc))
   272  					f.Unlock()
   273  					desc.SendStale(ctx, uid)
   274  					return
   275  				}
   276  			case <-control.shutdownCh:
   277  				f.Lock()
   278  				defer f.Unlock()
   279  				f.Debug(ctx, "spawnRetrier: shutdown received, going down: desc: %s", desc)
   280  				delete(f.retriers, f.key(uid, desc))
   281  				return
   282  			}
   283  
   284  			attempts++
   285  			if attempts > fetchMaxAttempts {
   286  				f.Debug(ctx, "spawnRetrier: max attempts reached, bailing: desc: %s", desc)
   287  				control.Shutdown()
   288  			}
   289  			nextTime = f.nextAttemptTime(attempts, f.clock.Now())
   290  			f.Debug(ctx, "spawnRetrier: attempts: %d next: %v desc: %s", attempts, nextTime, desc)
   291  		}
   292  	}()
   293  }
   294  
   295  // Failure indicates a failure of type kind has happened when loading a conversation.
   296  func (f *FetchRetrier) Failure(ctx context.Context, uid gregor1.UID, desc types.RetryDescription) {
   297  	defer f.Trace(ctx, nil, fmt.Sprintf("Failure(%s)", desc))()
   298  	f.Lock()
   299  	defer f.Unlock()
   300  	if !f.running {
   301  		f.Debug(ctx, "Failure: not starting new retrier, not running")
   302  		return
   303  	}
   304  	key := f.key(uid, desc)
   305  	if _, ok := f.retriers[key]; !ok {
   306  		f.Debug(ctx, "Failure: spawning new retrier: desc: %s", desc)
   307  		control := newRetrierControl(desc)
   308  		f.retriers[key] = control
   309  		f.spawnRetrier(ctx, uid, desc, control)
   310  	}
   311  }
   312  
   313  // Success indicates a success of type kind loading a conversation. This effectively removes
   314  // that conversation from the retry queue.
   315  func (f *FetchRetrier) Success(ctx context.Context, uid gregor1.UID, desc types.RetryDescription) {
   316  	defer f.Trace(ctx, nil, fmt.Sprintf("Success(%s)", desc))()
   317  	f.Lock()
   318  	defer f.Unlock()
   319  	key := f.key(uid, desc)
   320  	if control, ok := f.retriers[key]; ok {
   321  		control.Shutdown()
   322  	}
   323  }
   324  
   325  // Connected is called when a connection to the chat server is established, and forces a
   326  // pass over the retry queue
   327  func (f *FetchRetrier) Connected(ctx context.Context) {
   328  	defer f.Trace(ctx, nil, "Connected")()
   329  	f.Lock()
   330  	defer f.Unlock()
   331  	f.offline = false
   332  	for _, control := range f.retriers {
   333  		control.Force()
   334  	}
   335  }
   336  
   337  // Disconnected is called when we lose connection to the chat server, and pauses attempts
   338  // on the retry queue.
   339  func (f *FetchRetrier) Disconnected(ctx context.Context) {
   340  	f.Lock()
   341  	defer f.Unlock()
   342  	f.offline = true
   343  }
   344  
   345  // IsOffline returns if the module thinks we are connected to the chat server.
   346  func (f *FetchRetrier) IsOffline(ctx context.Context) bool {
   347  	f.Lock()
   348  	defer f.Unlock()
   349  	return f.offline
   350  }
   351  
   352  // Force forces a run of the retry loop.
   353  func (f *FetchRetrier) Force(ctx context.Context) {
   354  	defer f.Trace(ctx, nil, "Force")()
   355  	f.Lock()
   356  	defer f.Unlock()
   357  	for _, control := range f.retriers {
   358  		control.Force()
   359  	}
   360  }
   361  
   362  func (f *FetchRetrier) Rekey(ctx context.Context, name string, membersType chat1.ConversationMembersType,
   363  	public bool) {
   364  	nameInfo, err := CreateNameInfoSource(ctx, f.G(), membersType).LookupID(ctx, name, public)
   365  	if err != nil {
   366  		f.Debug(ctx, "Rekey: failed to load name info for: %s msg %s", name, err)
   367  		return
   368  	}
   369  	var forces []*retrierControl
   370  	f.Lock()
   371  	for _, control := range f.retriers {
   372  		if control.desc.RekeyFixable(ctx, nameInfo.ID) {
   373  			forces = append(forces, control)
   374  		}
   375  	}
   376  	f.Unlock()
   377  	for _, force := range forces {
   378  		f.Debug(ctx, "Rekey: forcing: %s", force.desc)
   379  		force.Force()
   380  	}
   381  }
   382  
   383  func (f *FetchRetrier) Stop(ctx context.Context) chan struct{} {
   384  	defer f.Trace(ctx, nil, "Shutdown")()
   385  	f.Lock()
   386  	defer f.Unlock()
   387  	f.running = false
   388  	for _, control := range f.retriers {
   389  		control.Shutdown()
   390  	}
   391  	ch := make(chan struct{})
   392  	close(ch)
   393  	return ch
   394  }
   395  
   396  func (f *FetchRetrier) Start(ctx context.Context, uid gregor1.UID) {
   397  	f.Lock()
   398  	defer f.Unlock()
   399  	f.running = true
   400  }