github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/chat/retry.go (about) 1 package chat 2 3 import ( 4 "encoding/hex" 5 "errors" 6 "fmt" 7 "sync" 8 "time" 9 10 "github.com/keybase/client/go/chat/globals" 11 "github.com/keybase/client/go/chat/types" 12 "github.com/keybase/client/go/chat/utils" 13 "github.com/keybase/client/go/protocol/chat1" 14 "github.com/keybase/client/go/protocol/gregor1" 15 "github.com/keybase/clockwork" 16 "github.com/keybase/go-codec/codec" 17 context "golang.org/x/net/context" 18 ) 19 20 type FetchType int 21 22 const ( 23 InboxLoad FetchType = iota 24 ThreadLoad 25 FullInboxLoad 26 ) 27 28 const fetchInitialInterval = 3 * time.Second 29 const fetchMultiplier = 1.5 30 const fetchMaxAttempts = 100 31 32 type ConversationRetry struct { 33 globals.Contextified 34 utils.DebugLabeler 35 36 convID chat1.ConversationID 37 tlfID *chat1.TLFID 38 kind FetchType 39 } 40 41 var _ types.RetryDescription = (*ConversationRetry)(nil) 42 43 func NewConversationRetry(g *globals.Context, convID chat1.ConversationID, tlfID *chat1.TLFID, kind FetchType) *ConversationRetry { 44 dstr := fmt.Sprintf("ConversationRetry(%s,%v)", convID, kind) 45 return &ConversationRetry{ 46 Contextified: globals.NewContextified(g), 47 DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), dstr, false), 48 convID: convID, 49 tlfID: tlfID, 50 kind: kind, 51 } 52 } 53 54 func (c *ConversationRetry) String() string { 55 return fmt.Sprintf("%s:%v", c.convID, c.kind) 56 } 57 58 func (c *ConversationRetry) RekeyFixable(ctx context.Context, tlfID chat1.TLFID) bool { 59 return c.tlfID != nil && c.tlfID.Eq(tlfID) 60 } 61 62 func (c *ConversationRetry) SendStale(ctx context.Context, uid gregor1.UID) { 63 supdates := []chat1.ConversationStaleUpdate{{ 64 ConvID: c.convID, 65 UpdateType: chat1.StaleUpdateType_NEWACTIVITY, 66 }} 67 c.G().Syncer.SendChatStaleNotifications(ctx, uid, supdates, false) 68 } 69 70 func (c *ConversationRetry) Fix(ctx context.Context, uid gregor1.UID) error { 71 if c.kind == ThreadLoad { 72 return c.fixThreadFetch(ctx, uid) 73 } 74 return c.fixInboxFetch(ctx, uid) 75 } 76 77 func (c *ConversationRetry) fixInboxFetch(ctx context.Context, uid gregor1.UID) error { 78 c.Debug(ctx, "fixInboxFetch: retrying conversation") 79 80 // Reload this conversation and hope it works 81 inbox, _, err := c.G().InboxSource.Read(ctx, uid, types.ConversationLocalizerBlocking, 82 types.InboxSourceDataSourceAll, nil, 83 &chat1.GetInboxLocalQuery{ 84 ConvIDs: []chat1.ConversationID{c.convID}, 85 }) 86 if err != nil { 87 c.Debug(ctx, "fixInboxFetch: failed to read inbox: msg: %s", err.Error()) 88 return err 89 } 90 if len(inbox.Convs) != 1 { 91 c.Debug(ctx, "fixInboxFetch: unusual number of results for Read call: len: %d", len(inbox.Convs)) 92 return errors.New("inbox fetch failed: unusual number of conversation returned") 93 } 94 conv := inbox.Convs[0] 95 96 if conv.Error == nil { 97 c.Debug(ctx, "fixInboxFetch: fixed convID: %s", conv.GetConvID()) 98 return nil 99 } 100 c.Debug(ctx, "fixInboxFetch: convID failed again: msg: %s typ: %v", 101 conv.Error.Message, conv.Error.Typ) 102 103 return fmt.Errorf("inbox fetch failed: %s", conv.Error.Message) 104 } 105 106 func (c *ConversationRetry) fixThreadFetch(ctx context.Context, uid gregor1.UID) error { 107 c.Debug(ctx, "fixThreadFetch: retrying conversation") 108 // Attempt a pull of 50 messages to simulate whatever request got the 109 // conversation in this queue. 110 _, err := c.G().ConvSource.Pull(ctx, c.convID, uid, chat1.GetThreadReason_FIXRETRY, nil, nil, 111 &chat1.Pagination{ 112 Num: 50, 113 }) 114 if err == nil { 115 c.Debug(ctx, "fixThreadFetch: fixed") 116 return nil 117 } 118 119 c.Debug(ctx, "fixThreadFetch: convID failed again: msg: %s", err.Error()) 120 return err 121 } 122 123 type FullInboxRetry struct { 124 globals.Contextified 125 utils.DebugLabeler 126 127 query *chat1.GetInboxLocalQuery 128 } 129 130 var _ types.RetryDescription = (*FullInboxRetry)(nil) 131 132 func NewFullInboxRetry(g *globals.Context, query *chat1.GetInboxLocalQuery) FullInboxRetry { 133 return FullInboxRetry{ 134 Contextified: globals.NewContextified(g), 135 DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "FullInboxRetry", false), 136 query: query, 137 } 138 } 139 140 func (f FullInboxRetry) String() string { 141 qstr := "<empty>" 142 if f.query != nil { 143 mh := codec.MsgpackHandle{WriteExt: true} 144 var data []byte 145 enc := codec.NewEncoderBytes(&data, &mh) 146 err := enc.Encode(*f.query) 147 if err != nil { 148 panic(err) 149 } 150 qstr = hex.EncodeToString(data) 151 } 152 pstr := "<empty>" 153 return qstr + pstr 154 } 155 156 func (f FullInboxRetry) RekeyFixable(ctx context.Context, tlfID chat1.TLFID) bool { 157 return false 158 } 159 160 func (f FullInboxRetry) SendStale(ctx context.Context, uid gregor1.UID) { 161 f.G().Syncer.SendChatStaleNotifications(ctx, uid, nil, true) 162 } 163 164 func (f FullInboxRetry) Fix(ctx context.Context, uid gregor1.UID) error { 165 query, _, err := f.G().InboxSource.GetInboxQueryLocalToRemote(ctx, f.query) 166 if err != nil { 167 f.Debug(ctx, "Fix: failed to convert query: %s", err.Error()) 168 return err 169 } 170 _, err = f.G().InboxSource.ReadUnverified(ctx, uid, types.InboxSourceDataSourceAll, query) 171 if err != nil { 172 f.Debug(ctx, "Fix: failed to load again: %d", err.Error()) 173 } 174 return nil 175 } 176 177 type retrierControl struct { 178 desc types.RetryDescription 179 forceCh chan struct{} 180 shutdownCh chan struct{} 181 } 182 183 func newRetrierControl(desc types.RetryDescription) *retrierControl { 184 return &retrierControl{ 185 desc: desc, 186 forceCh: make(chan struct{}, 1), 187 shutdownCh: make(chan struct{}, 1), 188 } 189 } 190 191 func (c *retrierControl) Shutdown() { 192 select { 193 case c.shutdownCh <- struct{}{}: 194 default: 195 } 196 } 197 198 func (c *retrierControl) Force() { 199 select { 200 case c.forceCh <- struct{}{}: 201 default: 202 } 203 } 204 205 // FetchRetrier is responsible for tracking any nonblock fetch failures, and retrying 206 // them automatically. 207 type FetchRetrier struct { 208 globals.Contextified 209 utils.DebugLabeler 210 sync.Mutex 211 212 retriers map[string]*retrierControl 213 clock clockwork.Clock 214 offline, running bool 215 } 216 217 var _ types.FetchRetrier = (*FetchRetrier)(nil) 218 219 func NewFetchRetrier(g *globals.Context) *FetchRetrier { 220 f := &FetchRetrier{ 221 Contextified: globals.NewContextified(g), 222 DebugLabeler: utils.NewDebugLabeler(g.ExternalG(), "FetchRetrier", false), 223 clock: clockwork.NewRealClock(), 224 retriers: make(map[string]*retrierControl), 225 } 226 return f 227 } 228 229 // SetClock sets a custom clock for testing. 230 func (f *FetchRetrier) SetClock(clock clockwork.Clock) { 231 f.clock = clock 232 } 233 234 func (f *FetchRetrier) key(uid gregor1.UID, desc types.RetryDescription) string { 235 return fmt.Sprintf("%s:%s", uid, desc) 236 } 237 238 // nextAttemptTime calculates the next try for a given retry item. It uses an exponential 239 // decay calculation. 240 func (f *FetchRetrier) nextAttemptTime(attempts int, lastAttempt time.Time) time.Time { 241 wait := time.Duration(float64(attempts) * fetchMultiplier * float64(fetchInitialInterval)) 242 return lastAttempt.Add(wait) 243 } 244 245 func (f *FetchRetrier) spawnRetrier(ctx context.Context, uid gregor1.UID, desc types.RetryDescription, 246 control *retrierControl) { 247 248 attempts := 1 249 nextTime := f.nextAttemptTime(attempts, f.clock.Now()) 250 ctx = globals.BackgroundChatCtx(ctx, f.G()) 251 go func() { 252 for { 253 select { 254 case <-f.clock.AfterTime(nextTime): 255 // Only attempts if we are online. Otherwise just retry 256 // at the same interval that we used last time. 257 if !f.offline { 258 f.Debug(ctx, "spawnRetrier: retrying after time: desc: %s", desc) 259 if err := desc.Fix(ctx, uid); err == nil { 260 f.Lock() 261 delete(f.retriers, f.key(uid, desc)) 262 f.Unlock() 263 desc.SendStale(ctx, uid) 264 return 265 } 266 } 267 case <-control.forceCh: 268 f.Debug(ctx, "spawnRetrier: retrying (forced): desc: %s", desc) 269 if err := desc.Fix(ctx, uid); err == nil { 270 f.Lock() 271 delete(f.retriers, f.key(uid, desc)) 272 f.Unlock() 273 desc.SendStale(ctx, uid) 274 return 275 } 276 case <-control.shutdownCh: 277 f.Lock() 278 defer f.Unlock() 279 f.Debug(ctx, "spawnRetrier: shutdown received, going down: desc: %s", desc) 280 delete(f.retriers, f.key(uid, desc)) 281 return 282 } 283 284 attempts++ 285 if attempts > fetchMaxAttempts { 286 f.Debug(ctx, "spawnRetrier: max attempts reached, bailing: desc: %s", desc) 287 control.Shutdown() 288 } 289 nextTime = f.nextAttemptTime(attempts, f.clock.Now()) 290 f.Debug(ctx, "spawnRetrier: attempts: %d next: %v desc: %s", attempts, nextTime, desc) 291 } 292 }() 293 } 294 295 // Failure indicates a failure of type kind has happened when loading a conversation. 296 func (f *FetchRetrier) Failure(ctx context.Context, uid gregor1.UID, desc types.RetryDescription) { 297 defer f.Trace(ctx, nil, fmt.Sprintf("Failure(%s)", desc))() 298 f.Lock() 299 defer f.Unlock() 300 if !f.running { 301 f.Debug(ctx, "Failure: not starting new retrier, not running") 302 return 303 } 304 key := f.key(uid, desc) 305 if _, ok := f.retriers[key]; !ok { 306 f.Debug(ctx, "Failure: spawning new retrier: desc: %s", desc) 307 control := newRetrierControl(desc) 308 f.retriers[key] = control 309 f.spawnRetrier(ctx, uid, desc, control) 310 } 311 } 312 313 // Success indicates a success of type kind loading a conversation. This effectively removes 314 // that conversation from the retry queue. 315 func (f *FetchRetrier) Success(ctx context.Context, uid gregor1.UID, desc types.RetryDescription) { 316 defer f.Trace(ctx, nil, fmt.Sprintf("Success(%s)", desc))() 317 f.Lock() 318 defer f.Unlock() 319 key := f.key(uid, desc) 320 if control, ok := f.retriers[key]; ok { 321 control.Shutdown() 322 } 323 } 324 325 // Connected is called when a connection to the chat server is established, and forces a 326 // pass over the retry queue 327 func (f *FetchRetrier) Connected(ctx context.Context) { 328 defer f.Trace(ctx, nil, "Connected")() 329 f.Lock() 330 defer f.Unlock() 331 f.offline = false 332 for _, control := range f.retriers { 333 control.Force() 334 } 335 } 336 337 // Disconnected is called when we lose connection to the chat server, and pauses attempts 338 // on the retry queue. 339 func (f *FetchRetrier) Disconnected(ctx context.Context) { 340 f.Lock() 341 defer f.Unlock() 342 f.offline = true 343 } 344 345 // IsOffline returns if the module thinks we are connected to the chat server. 346 func (f *FetchRetrier) IsOffline(ctx context.Context) bool { 347 f.Lock() 348 defer f.Unlock() 349 return f.offline 350 } 351 352 // Force forces a run of the retry loop. 353 func (f *FetchRetrier) Force(ctx context.Context) { 354 defer f.Trace(ctx, nil, "Force")() 355 f.Lock() 356 defer f.Unlock() 357 for _, control := range f.retriers { 358 control.Force() 359 } 360 } 361 362 func (f *FetchRetrier) Rekey(ctx context.Context, name string, membersType chat1.ConversationMembersType, 363 public bool) { 364 nameInfo, err := CreateNameInfoSource(ctx, f.G(), membersType).LookupID(ctx, name, public) 365 if err != nil { 366 f.Debug(ctx, "Rekey: failed to load name info for: %s msg %s", name, err) 367 return 368 } 369 var forces []*retrierControl 370 f.Lock() 371 for _, control := range f.retriers { 372 if control.desc.RekeyFixable(ctx, nameInfo.ID) { 373 forces = append(forces, control) 374 } 375 } 376 f.Unlock() 377 for _, force := range forces { 378 f.Debug(ctx, "Rekey: forcing: %s", force.desc) 379 force.Force() 380 } 381 } 382 383 func (f *FetchRetrier) Stop(ctx context.Context) chan struct{} { 384 defer f.Trace(ctx, nil, "Shutdown")() 385 f.Lock() 386 defer f.Unlock() 387 f.running = false 388 for _, control := range f.retriers { 389 control.Shutdown() 390 } 391 ch := make(chan struct{}) 392 close(ch) 393 return ch 394 } 395 396 func (f *FetchRetrier) Start(ctx context.Context, uid gregor1.UID) { 397 f.Lock() 398 defer f.Unlock() 399 f.running = true 400 }