github.com/decred/dcrlnd@v0.7.6/watchtower/wtclient/session_queue.go

package wtclient

import (
    "container/list"
    "fmt"
    "sync"
    "time"

    "github.com/decred/dcrd/chaincfg/chainhash"
    "github.com/decred/dcrlnd/input"
    "github.com/decred/dcrlnd/keychain"
    "github.com/decred/dcrlnd/lnwire"
    "github.com/decred/dcrlnd/watchtower/wtdb"
    "github.com/decred/dcrlnd/watchtower/wtserver"
    "github.com/decred/dcrlnd/watchtower/wtwire"
    "github.com/decred/slog"
)

// reserveStatus is an enum that signals how full a particular session is.
type reserveStatus uint8

const (
    // reserveAvailable indicates that the session has space for at least
    // one more backup.
    reserveAvailable reserveStatus = iota

    // reserveExhausted indicates that all slots in the session have been
    // allocated.
    reserveExhausted
)

// sessionQueueConfig bundles the resources required by the sessionQueue to
// perform its duties. All entries MUST be non-nil.
type sessionQueueConfig struct {
    // ClientSession provides access to the negotiated session parameters
    // and allows updating its persistent storage.
    ClientSession *wtdb.ClientSession

    // ChainHash identifies the chain for which the session's justice
    // transactions are targeted.
    ChainHash chainhash.Hash

    // Dial allows the client to dial the tower using its public key and
    // net address.
    Dial func(keychain.SingleKeyECDH, *lnwire.NetAddress) (wtserver.Peer,
        error)

    // SendMessage encodes, encrypts, and writes a message to the given
    // peer.
    SendMessage func(wtserver.Peer, wtwire.Message) error

    // ReadMessage receives, decrypts, and decodes a message from the given
    // peer.
    ReadMessage func(wtserver.Peer) (wtwire.Message, error)

    // Signer facilitates signing of inputs, used to construct the
    // witnesses for justice transaction inputs.
    Signer input.Signer

    // DB provides access to the client's stable storage.
    DB DB

    // MinBackoff defines the initial backoff applied by the session
    // queue before reconnecting to the tower after a failed or partially
    // successful batch is sent. Subsequent backoff durations will grow
    // exponentially up until MaxBackoff.
    MinBackoff time.Duration

    // MaxBackoff defines the maximum backoff applied by the session
    // queue before reconnecting to the tower after a failed or partially
    // successful batch is sent. If the exponential backoff produces a
    // timeout greater than this value, the backoff duration will be
    // clamped to MaxBackoff.
    MaxBackoff time.Duration

    // Log specifies the desired log output, which should be prefixed by
    // the client type, e.g. anchor or legacy.
    Log slog.Logger
}
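
// As an illustrative sketch of the backoff behavior described above, a
// hypothetical configuration with MinBackoff of 1s and MaxBackoff of 1m would
// retry failed batches after roughly:
//
//    1s, 2s, 4s, 8s, 16s, 32s, 1m, 1m, ...
//
// The delay doubles after each failed or partially successful batch until it
// is clamped to MaxBackoff, and resets to MinBackoff once a batch is
// delivered in full.
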
// sessionQueue implements a reliable queue that will encrypt and send accepted
// backups to the watchtower specified in the config's ClientSession. Calling
// Stop will attempt to perform a clean shutdown by receiving an ACK from the
// tower for all pending backups before exiting. The clean shutdown can be
// aborted by using ForceQuit, which will attempt to shut down the queue
// immediately.
type sessionQueue struct {
    started sync.Once
    stopped sync.Once
    forced  sync.Once

    cfg *sessionQueueConfig
    log slog.Logger

    commitQueue  *list.List
    pendingQueue *list.List
    queueMtx     sync.Mutex
    queueCond    *sync.Cond

    localInit *wtwire.Init
    towerAddr *lnwire.NetAddress

    seqNum uint16

    retryBackoff time.Duration

    quit      chan struct{}
    forceQuit chan struct{}
    shutdown  chan struct{}
}

// newSessionQueue initializes a fresh sessionQueue.
func newSessionQueue(cfg *sessionQueueConfig) *sessionQueue {
    localInit := wtwire.NewInitMessage(
        lnwire.NewRawFeatureVector(wtwire.AltruistSessionsRequired),
        cfg.ChainHash,
    )

    towerAddr := &lnwire.NetAddress{
        IdentityKey: cfg.ClientSession.Tower.IdentityKey,
        Address:     cfg.ClientSession.Tower.Addresses[0],
    }

    sq := &sessionQueue{
        cfg:          cfg,
        log:          cfg.Log,
        commitQueue:  list.New(),
        pendingQueue: list.New(),
        localInit:    localInit,
        towerAddr:    towerAddr,
        seqNum:       cfg.ClientSession.SeqNum,
        retryBackoff: cfg.MinBackoff,
        quit:         make(chan struct{}),
        forceQuit:    make(chan struct{}),
        shutdown:     make(chan struct{}),
    }
    sq.queueCond = sync.NewCond(&sq.queueMtx)

    // The database should return the committed updates in sorted order, and
    // the session queue's sequence number will be equal to that of the last
    // committed update.
    for _, update := range sq.cfg.ClientSession.CommittedUpdates {
        sq.commitQueue.PushBack(update)
    }

    return sq
}

// Start idempotently starts the sessionQueue so that it can begin accepting
// backups.
func (q *sessionQueue) Start() {
    q.started.Do(func() {
        go q.sessionManager()
    })
}

// Stop idempotently stops the sessionQueue by initiating a clean shutdown that
// will clear all pending tasks in the queue before returning to the caller.
func (q *sessionQueue) Stop() {
    q.stopped.Do(func() {
        q.log.Debugf("SessionQueue(%s) stopping ...", q.ID())

        close(q.quit)
        q.signalUntilShutdown()

        // Skip log if we also force quit.
        select {
        case <-q.forceQuit:
            return
        default:
        }

        q.log.Debugf("SessionQueue(%s) stopped", q.ID())
    })
}

// ForceQuit idempotently aborts any clean shutdown in progress and returns to
// the caller after all lingering goroutines have spun down.
func (q *sessionQueue) ForceQuit() {
    q.forced.Do(func() {
        q.log.Infof("SessionQueue(%s) force quitting...", q.ID())

        close(q.forceQuit)
        q.signalUntilShutdown()

        q.log.Infof("SessionQueue(%s) force quit", q.ID())
    })
}

// ID returns the wtdb.SessionID for the queue, which can be used to uniquely
// identify this particular queue.
func (q *sessionQueue) ID() *wtdb.SessionID {
    return &q.cfg.ClientSession.ID
}
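
// The following is an illustrative sketch of how a caller might drive a
// sessionQueue, assuming a fully populated sessionQueueConfig named cfg and a
// backupTask named task (both supplied by the surrounding client, not shown
// here):
//
//    sq := newSessionQueue(cfg)
//    sq.Start()
//    defer sq.Stop()
//
//    status, accepted := sq.AcceptTask(task)
//    if !accepted {
//        // Retry the task against a different session.
//    }
//    if status == reserveExhausted {
//        // Negotiate a new session for subsequent tasks.
//    }
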
// AcceptTask attempts to queue a backupTask for delivery to the sessionQueue's
// tower. The task will only be accepted if the queue is not already exhausted
// and the task is successfully bound to the ClientSession.
func (q *sessionQueue) AcceptTask(task *backupTask) (reserveStatus, bool) {
    q.queueCond.L.Lock()

    numPending := uint32(q.pendingQueue.Len())
    maxUpdates := q.cfg.ClientSession.Policy.MaxUpdates
    q.log.Debugf("SessionQueue(%s) deciding to accept %v seqnum=%d "+
        "pending=%d max-updates=%d",
        q.ID(), task.id, q.seqNum, numPending, maxUpdates)

    // Examine the current reserve status of the session queue.
    curStatus := q.reserveStatus()

    switch curStatus {

    // The session queue is exhausted, and cannot accept the task because
    // it is full. Reject the task such that it can be tried against a
    // different session.
    case reserveExhausted:
        q.queueCond.L.Unlock()
        return curStatus, false

    // The session queue is not exhausted. Compute the sweep and reward
    // outputs as a function of the session parameters. If the outputs are
    // dusty or uneconomical to back up, the task is rejected and will not
    // be tried again.
    //
    // TODO(conner): queue backups and retry with different session params.
    case reserveAvailable:
        err := task.bindSession(&q.cfg.ClientSession.ClientSessionBody)
        if err != nil {
            q.queueCond.L.Unlock()
            q.log.Debugf("SessionQueue(%s) rejected %v: %v ",
                q.ID(), task.id, err)
            return curStatus, false
        }
    }

    // The sweep and reward outputs satisfy the session's policy, queue the
    // task for final signing and delivery.
    q.pendingQueue.PushBack(task)

    // Finally, compute the session's *new* reserve status. This will be
    // used by the client to determine if it can continue using this
    // session queue, or if it should negotiate a new one.
    newStatus := q.reserveStatus()
    q.queueCond.L.Unlock()

    q.queueCond.Signal()

    return newStatus, true
}

// sessionManager is the primary event loop for the sessionQueue, and is
// responsible for encrypting and sending accepted tasks to the tower.
func (q *sessionQueue) sessionManager() {
    defer close(q.shutdown)

    for {
        q.queueCond.L.Lock()
        for q.commitQueue.Len() == 0 &&
            q.pendingQueue.Len() == 0 {

            q.queueCond.Wait()

            select {
            case <-q.quit:
                if q.commitQueue.Len() == 0 &&
                    q.pendingQueue.Len() == 0 {
                    q.queueCond.L.Unlock()
                    return
                }
            case <-q.forceQuit:
                q.queueCond.L.Unlock()
                return
            default:
            }
        }
        q.queueCond.L.Unlock()

        // Exit immediately if a force quit has been requested. If either
        // of the queues still has state updates to send to the tower, we
        // may never exit in the above case if we are unable to reach the
        // tower for some reason.
        select {
        case <-q.forceQuit:
            return
        default:
        }

        // Initiate a new connection to the watchtower and attempt to
        // drain all pending tasks.
        q.drainBackups()
    }
}

// drainBackups attempts to send all pending updates in the queue to the tower.
func (q *sessionQueue) drainBackups() {
    // First, check that we are able to dial this session's tower.
    conn, err := q.cfg.Dial(q.cfg.ClientSession.SessionKeyECDH, q.towerAddr)
    if err != nil {
        q.log.Errorf("SessionQueue(%s) unable to dial tower at %v: %v",
            q.ID(), q.towerAddr, err)

        q.increaseBackoff()
        select {
        case <-time.After(q.retryBackoff):
        case <-q.forceQuit:
        }
        return
    }
    defer conn.Close()

    // Begin draining the queue of pending state updates. Before the first
    // update is sent, we will precede it with an Init message. If the first
    // send is successful, subsequent updates can be streamed without sending
    // an Init.
    for sendInit := true; ; sendInit = false {
        // Generate the next state update to upload to the tower. This
        // method will first proceed in dequeueing committed updates
        // before attempting to dequeue any pending updates.
        stateUpdate, isPending, backupID, err := q.nextStateUpdate()
        if err != nil {
            q.log.Errorf("SessionQueue(%v) unable to get next state "+
                "update: %v", q.ID(), err)
            return
        }

        // Now, send the state update to the tower and wait for a reply.
        err = q.sendStateUpdate(
            conn, stateUpdate, q.localInit, sendInit, isPending,
        )
        if err != nil {
            q.log.Errorf("SessionQueue(%s) unable to send state "+
                "update: %v", q.ID(), err)

            q.increaseBackoff()
            select {
            case <-time.After(q.retryBackoff):
            case <-q.forceQuit:
            }
            return
        }

        q.log.Infof("SessionQueue(%s) uploaded %v seqnum=%d",
            q.ID(), backupID, stateUpdate.SeqNum)

        // If the last task was backed up successfully, we'll exit and
        // continue once more tasks are added to the queue. We'll also
        // clear any accumulated backoff as this batch was able to be
        // sent reliably.
        if stateUpdate.IsComplete == 1 {
            q.resetBackoff()
            return
        }

        // Always apply a small delay between sends, which makes the
        // unit tests more reliable. If we were requested to back off,
        // then we will do so.
        select {
        case <-time.After(time.Millisecond):
        case <-q.forceQuit:
            return
        }
    }
}
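
// For reference, a successful drain of n queued updates corresponds roughly
// to the following exchange over a single connection, assuming the tower
// accepts every update (a sketch, not a normative trace):
//
//    client -> tower: Init
//    tower  -> client: Init
//    client -> tower: StateUpdate (seqnum=k)
//    tower  -> client: StateUpdateReply (CodeOK, LastApplied=k)
//    ...
//    client -> tower: StateUpdate (seqnum=k+n-1, IsComplete=1)
//    tower  -> client: StateUpdateReply (CodeOK, LastApplied=k+n-1)
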
// nextStateUpdate returns the next wtwire.StateUpdate to upload to the tower.
// If any committed updates are present, this method will reconstruct the state
// update from the committed update using the current last applied value found
// in the database. Otherwise, it will select the next pending update, craft
// the payload, and commit an update before returning the state update to send.
// The boolean value in the response is true if the state update is taken from
// the pending queue, allowing the caller to remove the update from either the
// commit or pending queue if the update is successfully acked.
func (q *sessionQueue) nextStateUpdate() (*wtwire.StateUpdate, bool,
    wtdb.BackupID, error) {

    var (
        seqNum    uint16
        update    wtdb.CommittedUpdate
        isLast    bool
        isPending bool
    )

    q.queueCond.L.Lock()
    switch {

    // If the commit queue is non-empty, parse the next committed update.
    case q.commitQueue.Len() > 0:
        next := q.commitQueue.Front()

        update = next.Value.(wtdb.CommittedUpdate)
        seqNum = update.SeqNum

        // If this is the last item in the commit queue and no items
        // exist in the pending queue, we will use the IsComplete flag
        // in the StateUpdate to signal that the tower can release the
        // connection after replying to free up resources.
        isLast = q.commitQueue.Len() == 1 && q.pendingQueue.Len() == 0
        q.queueCond.L.Unlock()

        q.log.Debugf("SessionQueue(%s) reprocessing committed state "+
            "update for %v seqnum=%d",
            q.ID(), update.BackupID, seqNum)

    // Otherwise, craft and commit the next update from the pending queue.
    default:
        isPending = true

        // Determine the current sequence number to apply for this
        // pending update.
        seqNum = q.seqNum + 1

        // Obtain the next task from the queue.
        next := q.pendingQueue.Front()
        task := next.Value.(*backupTask)

        // If this is the last item in the pending queue, we will use
        // the IsComplete flag in the StateUpdate to signal that the
        // tower can release the connection after replying to free up
        // resources.
        isLast = q.pendingQueue.Len() == 1
        q.queueCond.L.Unlock()

        hint, encBlob, err := task.craftSessionPayload(q.cfg.Signer)
        if err != nil {
            // TODO(conner): mark will not send
            err := fmt.Errorf("unable to craft session payload: %v",
                err)
            return nil, false, wtdb.BackupID{}, err
        }
        // TODO(conner): special case other obscure errors

        update = wtdb.CommittedUpdate{
            SeqNum: seqNum,
            CommittedUpdateBody: wtdb.CommittedUpdateBody{
                BackupID:      task.id,
                Hint:          hint,
                EncryptedBlob: encBlob,
            },
        }

        q.log.Debugf("SessionQueue(%s) committing state update "+
            "%v seqnum=%d", q.ID(), update.BackupID, seqNum)
    }

    // Before sending the task to the tower, commit the state update to
    // disk using the assigned sequence number. If this task has already
    // been committed, the call will succeed and only be used for the
    // purpose of obtaining the last applied value to send to the tower.
    //
    // This step ensures that if we crash before receiving an ack, we will
    // retransmit the same update. If the tower successfully received the
    // update before, it will reply with an ACK regardless of what we send
    // the next time. This also ensures that we reliably send the same
    // update for a given sequence number, preventing us from thinking we
    // backed up one state when we instead backed up another.
    lastApplied, err := q.cfg.DB.CommitUpdate(q.ID(), &update)
    if err != nil {
        // TODO(conner): mark failed/reschedule
        err := fmt.Errorf("unable to commit state update for "+
            "%v seqnum=%d: %v", update.BackupID, seqNum, err)
        return nil, false, wtdb.BackupID{}, err
    }

    stateUpdate := &wtwire.StateUpdate{
        SeqNum:        update.SeqNum,
        LastApplied:   lastApplied,
        Hint:          update.Hint,
        EncryptedBlob: update.EncryptedBlob,
    }

    // Set the IsComplete flag if this is the last queued item.
    if isLast {
        stateUpdate.IsComplete = 1
    }

    return stateUpdate, isPending, update.BackupID, nil
}

// sendStateUpdate sends a wtwire.StateUpdate to the watchtower and processes
// the ACK before returning. If sendInit is true, this method will first send
// the localInit message and verify that the tower supports our required
// feature bits. An error is returned if any part of the send fails.
func (q *sessionQueue) sendStateUpdate(conn wtserver.Peer,
    stateUpdate *wtwire.StateUpdate, localInit *wtwire.Init,
    sendInit, isPending bool) error {

    // If this is the first message being sent to the tower, we must send
    // an Init message to establish that the server supports the features
    // we require.
    if sendInit {
        // Send Init to tower.
        err := q.cfg.SendMessage(conn, q.localInit)
        if err != nil {
            return err
        }

        // Receive Init from tower.
        remoteMsg, err := q.cfg.ReadMessage(conn)
        if err != nil {
            return err
        }

        remoteInit, ok := remoteMsg.(*wtwire.Init)
        if !ok {
            return fmt.Errorf("watchtower %s responded with %T "+
                "to Init", q.towerAddr, remoteMsg)
        }

        // Validate Init.
        err = q.localInit.CheckRemoteInit(
            remoteInit, wtwire.FeatureNames,
        )
        if err != nil {
            return err
        }
    }

    // Send StateUpdate to tower.
    err := q.cfg.SendMessage(conn, stateUpdate)
    if err != nil {
        return err
    }

    // Receive StateUpdate from tower.
    remoteMsg, err := q.cfg.ReadMessage(conn)
    if err != nil {
        return err
    }

    stateUpdateReply, ok := remoteMsg.(*wtwire.StateUpdateReply)
    if !ok {
        return fmt.Errorf("watchtower %s responded with %T to "+
            "StateUpdate", q.towerAddr, remoteMsg)
    }

    // Process the reply from the tower.
    switch stateUpdateReply.Code {

    // The tower reported a successful update, validate the response and
    // record the last applied value returned.
    case wtwire.CodeOK:

    // TODO(conner): handle other error cases properly, ban towers, etc.
    default:
        err := fmt.Errorf("received error code %v in "+
            "StateUpdateReply for seqnum=%d",
            stateUpdateReply.Code, stateUpdate.SeqNum)
        q.log.Warnf("SessionQueue(%s) unable to upload state update to "+
            "tower=%s: %v", q.ID(), q.towerAddr, err)
        return err
    }

    lastApplied := stateUpdateReply.LastApplied
    err = q.cfg.DB.AckUpdate(q.ID(), stateUpdate.SeqNum, lastApplied)
    switch {
    case err == wtdb.ErrUnallocatedLastApplied:
        // TODO(conner): borked watchtower
        err = fmt.Errorf("unable to ack seqnum=%d: %v",
            stateUpdate.SeqNum, err)
        q.log.Errorf("SessionQueue(%s) failed to ack update: %v",
            q.ID(), err)
        return err

    case err == wtdb.ErrLastAppliedReversion:
        // TODO(conner): borked watchtower
        err = fmt.Errorf("unable to ack seqnum=%d: %v",
            stateUpdate.SeqNum, err)
        q.log.Errorf("SessionQueue(%s) failed to ack update: %v",
            q.ID(), err)
        return err

    case err != nil:
        err = fmt.Errorf("unable to ack seqnum=%d: %v",
            stateUpdate.SeqNum, err)
        q.log.Errorf("SessionQueue(%s) failed to ack update: %v",
            q.ID(), err)
        return err
    }

    q.queueCond.L.Lock()
    if isPending {
        // If a pending update was successfully sent, increment the
        // sequence number and remove the item from the queue. This
        // ensures the total number of backups in the session remains
        // unchanged, which maintains the external view of the
        // session's reserve status.
        q.seqNum++
        q.pendingQueue.Remove(q.pendingQueue.Front())
    } else {
        // Otherwise, simply remove the update from the committed
        // queue. This has no effect on the queue's reserve status
        // since the update had already been committed.
        q.commitQueue.Remove(q.commitQueue.Front())
    }
    q.queueCond.L.Unlock()

    return nil
}
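
// As a worked example of the reserve computation in reserveStatus below: with
// a hypothetical policy of MaxUpdates=1024, a session whose seqNum is 1020 and
// which holds 3 pending tasks reports reserveAvailable, since 1020+3 < 1024.
// Accepting one more task brings the total to 1024, at which point the session
// reports reserveExhausted and the client must negotiate a new session for
// further backups.
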
// reserveStatus returns a reserveStatus indicating whether or not the
// sessionQueue can accept another task. reserveAvailable is returned when a
// task can be accepted, and reserveExhausted is returned if all slots in the
// session have been allocated.
//
// NOTE: This method MUST be called with queueCond's exclusive lock held.
func (q *sessionQueue) reserveStatus() reserveStatus {
    numPending := uint32(q.pendingQueue.Len())
    maxUpdates := uint32(q.cfg.ClientSession.Policy.MaxUpdates)

    if uint32(q.seqNum)+numPending < maxUpdates {
        return reserveAvailable
    }

    return reserveExhausted
}

// resetBackoff resets the connection backoff to the minimum configured
// backoff.
func (q *sessionQueue) resetBackoff() {
    q.retryBackoff = q.cfg.MinBackoff
}

// increaseBackoff doubles the current connection backoff, clamping to the
// configured maximum backoff if it would exceed the limit.
func (q *sessionQueue) increaseBackoff() {
    q.retryBackoff *= 2
    if q.retryBackoff > q.cfg.MaxBackoff {
        q.retryBackoff = q.cfg.MaxBackoff
    }
}

// signalUntilShutdown strobes the sessionQueue's condition variable until the
// main event loop exits.
func (q *sessionQueue) signalUntilShutdown() {
    for {
        select {
        case <-time.After(time.Millisecond):
            q.queueCond.Signal()
        case <-q.shutdown:
            return
        }
    }
}

// sessionQueueSet maintains a mapping of SessionIDs to their corresponding
// sessionQueue.
type sessionQueueSet map[wtdb.SessionID]*sessionQueue

// Add inserts a sessionQueue into the sessionQueueSet.
func (s *sessionQueueSet) Add(sessionQueue *sessionQueue) {
    (*s)[*sessionQueue.ID()] = sessionQueue
}

// ApplyAndWait executes the nil-adic function returned from getApply for each
// sessionQueue in the set in parallel, then waits for all of them to finish
// before returning to the caller.
func (s *sessionQueueSet) ApplyAndWait(getApply func(*sessionQueue) func()) {
    var wg sync.WaitGroup
    for _, sessionq := range *s {
        wg.Add(1)
        go func(sq *sessionQueue) {
            defer wg.Done()
            getApply(sq)()
        }(sessionq)
    }
    wg.Wait()
}
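
// An illustrative sketch of how the set might be used to stop all active
// session queues in parallel, assuming the surrounding client tracks them in
// a sessionQueueSet named activeSessions:
//
//    activeSessions := make(sessionQueueSet)
//    activeSessions.Add(sq)
//    ...
//    activeSessions.ApplyAndWait(func(sq *sessionQueue) func() {
//        return sq.Stop
//    })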