github.com/decred/dcrlnd@v0.7.6/chanfitness/chaneventstore.go

// Package chanfitness monitors the behaviour of channels to provide insight
// into the health and performance of a channel. This is achieved by
// maintaining an event store which tracks events for each channel.
//
// Lifespan: the period that the channel has been known to the scoring system.
// Note that lifespan may not equal the channel's full lifetime because data
// is not currently persisted.
//
// Uptime: the total time within a given period that the channel's remote peer
// has been online.
package chanfitness

import (
	"errors"
	"sync"
	"time"

	"github.com/decred/dcrd/wire"
	"github.com/decred/dcrlnd/channeldb"
	"github.com/decred/dcrlnd/channelnotifier"
	"github.com/decred/dcrlnd/clock"
	"github.com/decred/dcrlnd/peernotifier"
	"github.com/decred/dcrlnd/routing/route"
	"github.com/decred/dcrlnd/subscribe"
	"github.com/decred/dcrlnd/ticker"
)

const (
	// FlapCountFlushRate determines how often we write peer total flap
	// count to disk.
	FlapCountFlushRate = time.Hour
)

var (
	// errShuttingDown is returned when the store cannot respond to a query
	// because it has received the shutdown signal.
	errShuttingDown = errors.New("channel event store shutting down")

	// ErrChannelNotFound is returned when a query is made for a channel
	// that the event store does not have knowledge of.
	ErrChannelNotFound = errors.New("channel not found in event store")

	// ErrPeerNotFound is returned when a query is made for a channel
	// that has a peer that the event store is not currently tracking.
	ErrPeerNotFound = errors.New("peer not found in event store")
)

// ChannelEventStore maintains a set of event logs for the node's channels to
// provide insight into the performance and health of channels.
type ChannelEventStore struct {
	cfg *Config

	// peers tracks all of our currently monitored peers and their
	// channels.
	peers map[route.Vertex]peerMonitor

	// chanInfoRequests serves requests for information about our
	// channels.
	chanInfoRequests chan channelInfoRequest

	// peerRequests serves requests for information about a peer.
	peerRequests chan peerRequest

	quit chan struct{}

	wg sync.WaitGroup
}

// Config provides the event store with functions required to monitor channel
// activity. All elements of the config must be non-nil for the event store to
// operate.
type Config struct {
	// SubscribeChannelEvents provides a subscription client which provides
	// a stream of channel events.
	SubscribeChannelEvents func() (subscribe.Subscription, error)

	// SubscribePeerEvents provides a subscription client which provides a
	// stream of peer online/offline events.
	SubscribePeerEvents func() (subscribe.Subscription, error)

	// GetOpenChannels provides a list of existing open channels which is
	// used to populate the ChannelEventStore with a set of channels on
	// startup.
	GetOpenChannels func() ([]*channeldb.OpenChannel, error)

	// Clock is the time source that the subsystem uses, provided here
	// for ease of testing.
	Clock clock.Clock

	// WriteFlapCount records the flap count for a set of peers on disk.
	WriteFlapCount func(map[route.Vertex]*channeldb.FlapCount) error

	// ReadFlapCount gets the flap count for a peer on disk.
	ReadFlapCount func(route.Vertex) (*channeldb.FlapCount, error)

	// FlapCountTicker is a ticker which controls how often we flush our
	// peer's flap count to disk.
	FlapCountTicker ticker.Ticker
}
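
// A minimal sketch of wiring up a Config. The channelNotifier, peerNotifier
// and db handles (and their method names) are assumptions about the caller,
// not requirements of this package; clock.NewDefaultClock and ticker.New are
// the stock implementations from the imported packages:
//
//	// Hypothetical caller-side wiring; handles below are assumed to exist.
//	cfg := &Config{
//		SubscribeChannelEvents: func() (subscribe.Subscription, error) {
//			return channelNotifier.SubscribeChannelEvents()
//		},
//		SubscribePeerEvents: func() (subscribe.Subscription, error) {
//			return peerNotifier.SubscribePeerEvents()
//		},
//		GetOpenChannels: db.FetchAllOpenChannels,
//		Clock:           clock.NewDefaultClock(),
//		WriteFlapCount:  db.WriteFlapCounts,
//		ReadFlapCount:   db.ReadFlapCount,
//		FlapCountTicker: ticker.New(FlapCountFlushRate),
//	}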

// peerFlapCountMap is the map used to map peers to flap counts, declared here
// to allow shorter function signatures.
type peerFlapCountMap map[route.Vertex]*channeldb.FlapCount

type channelInfoRequest struct {
	peer         route.Vertex
	channelPoint wire.OutPoint
	responseChan chan channelInfoResponse
}

type channelInfoResponse struct {
	info *ChannelInfo
	err  error
}

type peerRequest struct {
	peer         route.Vertex
	responseChan chan peerResponse
}

type peerResponse struct {
	flapCount int
	ts        *time.Time
	err       error
}

// NewChannelEventStore initializes an event store with the config provided.
// Note that this function does not start the main event loop; Start() must be
// called.
func NewChannelEventStore(config *Config) *ChannelEventStore {
	store := &ChannelEventStore{
		cfg:              config,
		peers:            make(map[route.Vertex]peerMonitor),
		chanInfoRequests: make(chan channelInfoRequest),
		peerRequests:     make(chan peerRequest),
		quit:             make(chan struct{}),
	}

	return store
}

// Start adds all existing open channels to the event store and starts the
// main loop which records channel and peer events, and serves requests for
// information from the store. If this function fails, it cancels its existing
// subscriptions and returns an error.
func (c *ChannelEventStore) Start() error {
	// Create a subscription to channel events.
	channelClient, err := c.cfg.SubscribeChannelEvents()
	if err != nil {
		return err
	}

	// Create a subscription to peer events. If an error occurs, cancel
	// the existing subscription to channel events and return.
	peerClient, err := c.cfg.SubscribePeerEvents()
	if err != nil {
		channelClient.Cancel()
		return err
	}

	// cancel should be called to cancel all subscriptions if an error
	// occurs.
	cancel := func() {
		channelClient.Cancel()
		peerClient.Cancel()
	}

	// Add the existing set of channels to the event store. This is
	// required because channel events will not be triggered for channels
	// that exist at startup time.
	channels, err := c.cfg.GetOpenChannels()
	if err != nil {
		cancel()
		return err
	}

	log.Infof("Adding %v channels to event store", len(channels))

	for _, ch := range channels {
		peerKey, err := route.NewVertexFromBytes(
			ch.IdentityPub.SerializeCompressed(),
		)
		if err != nil {
			cancel()
			return err
		}

		// Add existing channels to the channel store with an initial
		// peer online or offline event.
		c.addChannel(ch.FundingOutpoint, peerKey)
	}

	// Start a goroutine that consumes events from all subscriptions.
	c.wg.Add(1)
	go c.consume(&subscriptions{
		channelUpdates: channelClient.Updates(),
		peerUpdates:    peerClient.Updates(),
		cancel:         cancel,
	})

	return nil
}
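
// A minimal lifecycle sketch for the store (hypothetical caller; cfg is a
// fully populated Config as described above):
//
//	store := NewChannelEventStore(cfg)
//	if err := store.Start(); err != nil {
//		return err
//	}
//	defer store.Stop()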

// Stop terminates all goroutines started by the event store.
func (c *ChannelEventStore) Stop() {
	log.Info("Stopping event store")

	// Stop the consume goroutine.
	close(c.quit)
	c.wg.Wait()

	// Stop the ticker after the goroutine reading from it has exited, to
	// avoid a race.
	c.cfg.FlapCountTicker.Stop()
}

// addChannel checks whether we are already tracking a channel's peer, creates
// a new peer log to track it if we are not yet monitoring it, and adds the
// channel.
func (c *ChannelEventStore) addChannel(channelPoint wire.OutPoint,
	peer route.Vertex) {

	peerMonitor, err := c.getPeerMonitor(peer)
	if err != nil {
		log.Errorf("could not create monitor: %v", err)
		return
	}

	if err := peerMonitor.addChannel(channelPoint); err != nil {
		log.Errorf("could not add channel: %v", err)
	}
}

// getPeerMonitor tries to get an existing peer monitor from our in-memory
// list, and falls back to creating a new monitor if it is not currently
// known.
func (c *ChannelEventStore) getPeerMonitor(peer route.Vertex) (peerMonitor,
	error) {

	peerMonitor, ok := c.peers[peer]
	if ok {
		return peerMonitor, nil
	}

	var (
		flapCount int
		lastFlap  *time.Time
	)

	historicalFlap, err := c.cfg.ReadFlapCount(peer)
	switch err {
	// If we do not have any records for this peer, we use a zero flap
	// count and a nil timestamp.
	case channeldb.ErrNoPeerBucket:

	case nil:
		flapCount = int(historicalFlap.Count)
		lastFlap = &historicalFlap.LastFlap

	// Return if we get an unexpected error.
	default:
		return nil, err
	}

	peerMonitor = newPeerLog(c.cfg.Clock, flapCount, lastFlap)
	c.peers[peer] = peerMonitor

	return peerMonitor, nil
}

// closeChannel records a closed time for a channel, and returns early if the
// channel is not known to the event store. We log warnings (rather than
// errors) when we cannot find a peer/channel because channels that we restore
// from a static channel backup never have their open notified, so the event
// store never learns about them; they are closed using the regular flow, so
// we will try to remove them on close. At present, we cannot easily
// distinguish between these closes and others.
func (c *ChannelEventStore) closeChannel(channelPoint wire.OutPoint,
	peer route.Vertex) {

	peerMonitor, ok := c.peers[peer]
	if !ok {
		log.Warnf("peer not known to store: %v", peer)
		return
	}

	if err := peerMonitor.removeChannel(channelPoint); err != nil {
		log.Warnf("could not remove channel: %v", err)
	}
}

// peerEvent creates a peer monitor for a peer if we do not currently have
// one, and records the online/offline event with it.
func (c *ChannelEventStore) peerEvent(peer route.Vertex, online bool) {
	peerMonitor, err := c.getPeerMonitor(peer)
	if err != nil {
		log.Errorf("could not create monitor: %v", err)
		return
	}

	peerMonitor.onlineEvent(online)
}

// subscriptions abstracts away from subscription clients to allow for
// mocking.
type subscriptions struct {
	channelUpdates <-chan interface{}
	peerUpdates    <-chan interface{}
	cancel         func()
}
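
// Because subscriptions is a plain struct of channels, in-package tests can
// drive consume without real subscription servers. A sketch, assuming
// hypothetical test scaffolding around an existing store and peer vertex:
//
//	channelUpdates := make(chan interface{})
//	peerUpdates := make(chan interface{})
//
//	store.wg.Add(1)
//	go store.consume(&subscriptions{
//		channelUpdates: channelUpdates,
//		peerUpdates:    peerUpdates,
//		cancel:         func() {},
//	})
//
//	// Deliver a synthetic peer online event.
//	peerUpdates <- peernotifier.PeerOnlineEvent{PubKey: peer}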

// consume is the event store's main loop. It consumes subscriptions to update
// the event store with channel and peer events, and serves requests for
// channel uptime and lifespan.
func (c *ChannelEventStore) consume(subscriptions *subscriptions) {
	// Start our flap count ticker.
	c.cfg.FlapCountTicker.Resume()

	// On exit, we will cancel our subscriptions and write our most recent
	// flap counts to disk. This ensures that we have consistent data in
	// the case of a graceful shutdown. If we do not shut down gracefully,
	// our worst case is data from our last flap count tick (at most one
	// hour old).
	defer func() {
		subscriptions.cancel()

		if err := c.recordFlapCount(); err != nil {
			log.Errorf("error recording flap on shutdown: %v", err)
		}

		c.wg.Done()
	}()

	// Consume events until the channel is closed.
	for {
		select {
		// Process channel opened and closed events.
		case e := <-subscriptions.channelUpdates:
			switch event := e.(type) {
			// A new channel has been opened, so we add the
			// channel to the store and record a channel open
			// event.
			case channelnotifier.OpenChannelEvent:
				compressed := event.Channel.IdentityPub.SerializeCompressed()
				peerKey, err := route.NewVertexFromBytes(
					compressed,
				)
				if err != nil {
					log.Errorf("Could not get vertex "+
						"from: %v", compressed)
					continue
				}

				c.addChannel(
					event.Channel.FundingOutpoint, peerKey,
				)

			// A channel has been closed, so we remove the channel
			// from the store and record a channel closed event.
			case channelnotifier.ClosedChannelEvent:
				compressed := event.CloseSummary.RemotePub.SerializeCompressed()
				peerKey, err := route.NewVertexFromBytes(
					compressed,
				)
				if err != nil {
					log.Errorf("Could not get vertex "+
						"from: %v", compressed)
					continue
				}

				c.closeChannel(
					event.CloseSummary.ChanPoint, peerKey,
				)
			}

		// Process peer online and offline events.
		case e := <-subscriptions.peerUpdates:
			switch event := e.(type) {
			// We have re-established a connection with our peer,
			// and should record an online event for any channels
			// with that peer.
			case peernotifier.PeerOnlineEvent:
				c.peerEvent(event.PubKey, true)

			// We have lost a connection with our peer, and should
			// record an offline event for any channels with that
			// peer.
			case peernotifier.PeerOfflineEvent:
				c.peerEvent(event.PubKey, false)
			}

		// Serve all requests for channel lifetime.
		case req := <-c.chanInfoRequests:
			var resp channelInfoResponse

			resp.info, resp.err = c.getChanInfo(req)
			req.responseChan <- resp

		// Serve all requests for information about our peer.
		case req := <-c.peerRequests:
			var resp peerResponse

			resp.flapCount, resp.ts, resp.err = c.flapCount(
				req.peer,
			)
			req.responseChan <- resp

		case <-c.cfg.FlapCountTicker.Ticks():
			if err := c.recordFlapCount(); err != nil {
				log.Errorf("could not record flap "+
					"count: %v", err)
			}

		// Exit if the store receives the signal to shut down.
		case <-c.quit:
			return
		}
	}
}

// ChannelInfo provides the set of information that the event store has
// recorded for a channel.
type ChannelInfo struct {
	// Lifetime is the total amount of time we have monitored the channel
	// for.
	Lifetime time.Duration

	// Uptime is the total amount of time that the channel peer has been
	// observed as online during the monitored lifespan.
	Uptime time.Duration
}
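
// The two durations in ChannelInfo can be combined into an uptime ratio for
// scoring. A minimal sketch (store, chanPoint and peer belong to a
// hypothetical caller):
//
//	info, err := store.GetChanInfo(chanPoint, peer)
//	if err != nil {
//		return err
//	}
//
//	var uptimeRatio float64
//	if info.Lifetime > 0 {
//		uptimeRatio = float64(info.Uptime) / float64(info.Lifetime)
//	}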

// GetChanInfo gets all the information we have on a channel in the event
// store.
func (c *ChannelEventStore) GetChanInfo(channelPoint wire.OutPoint,
	peer route.Vertex) (*ChannelInfo, error) {

	request := channelInfoRequest{
		peer:         peer,
		channelPoint: channelPoint,
		responseChan: make(chan channelInfoResponse),
	}

	// Send a request for the channel's information to the main event
	// loop, or return early with an error if the store has already
	// received a shutdown signal.
	select {
	case c.chanInfoRequests <- request:
	case <-c.quit:
		return nil, errShuttingDown
	}

	// Return the response we receive on the response channel or exit
	// early if the store is instructed to exit.
	select {
	case resp := <-request.responseChan:
		return resp.info, resp.err

	case <-c.quit:
		return nil, errShuttingDown
	}
}

// getChanInfo collects channel information for a channel. It gets uptime over
// the full lifetime of the channel.
func (c *ChannelEventStore) getChanInfo(req channelInfoRequest) (*ChannelInfo,
	error) {

	peerMonitor, ok := c.peers[req.peer]
	if !ok {
		return nil, ErrPeerNotFound
	}

	lifetime, uptime, err := peerMonitor.channelUptime(req.channelPoint)
	if err != nil {
		return nil, err
	}

	return &ChannelInfo{
		Lifetime: lifetime,
		Uptime:   uptime,
	}, nil
}

// FlapCount returns the flap count we have for a peer and the timestamp of
// its last flap. If we do not have any flaps recorded for the peer, the last
// flap timestamp will be nil.
func (c *ChannelEventStore) FlapCount(peer route.Vertex) (int, *time.Time,
	error) {

	request := peerRequest{
		peer:         peer,
		responseChan: make(chan peerResponse),
	}

	// Send a request for the peer's information to the main event loop,
	// or return early with an error if the store has already received a
	// shutdown signal.
	select {
	case c.peerRequests <- request:
	case <-c.quit:
		return 0, nil, errShuttingDown
	}

	// Return the response we receive on the response channel or exit
	// early if the store is instructed to exit.
	select {
	case resp := <-request.responseChan:
		return resp.flapCount, resp.ts, resp.err

	case <-c.quit:
		return 0, nil, errShuttingDown
	}
}
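
// The nil-timestamp contract above lets callers distinguish "no flaps ever
// recorded" from a peer with a known flap history. A sketch (hypothetical
// caller):
//
//	count, lastFlap, err := store.FlapCount(peer)
//	if err != nil {
//		return err
//	}
//
//	if lastFlap == nil {
//		// The store has no flap record at all for this peer.
//	} else {
//		log.Debugf("peer flapped %v times, last at %v", count,
//			*lastFlap)
//	}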

// flapCount gets our peer flap count and last flap timestamp from our
// in-memory record of a peer, falling back to the on-disk record if we are
// not currently tracking the peer. If we have no flap count recorded for the
// peer, a nil last flap time will be returned.
func (c *ChannelEventStore) flapCount(peer route.Vertex) (int, *time.Time,
	error) {

	// First check whether we are tracking this peer in memory, because
	// this record will have the most accurate flap count. We do not fail
	// if we can't find the peer in memory, because we may have previously
	// recorded its flap count on disk.
	peerMonitor, ok := c.peers[peer]
	if ok {
		count, ts := peerMonitor.getFlapCount()
		return count, ts, nil
	}

	// Try to get our flap count from the database. If this value is not
	// recorded, we return a nil last flap time to indicate that we have
	// no record of the peer's flap count.
	flapCount, err := c.cfg.ReadFlapCount(peer)
	switch err {
	case channeldb.ErrNoPeerBucket:
		return 0, nil, nil

	case nil:
		return int(flapCount.Count), &flapCount.LastFlap, nil

	default:
		return 0, nil, err
	}
}

// recordFlapCount records our flap count for each peer that we are currently
// tracking, skipping peers that have no recorded flaps (a nil last flap
// time).
func (c *ChannelEventStore) recordFlapCount() error {
	updates := make(peerFlapCountMap)

	for peer, monitor := range c.peers {
		flapCount, lastFlap := monitor.getFlapCount()
		if lastFlap == nil {
			continue
		}

		updates[peer] = &channeldb.FlapCount{
			Count:    uint32(flapCount),
			LastFlap: *lastFlap,
		}
	}

	log.Debugf("recording flap count for %v peers", len(updates))

	return c.cfg.WriteFlapCount(updates)
}
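
// For tests, the persistence hooks in Config can be backed by a simple map,
// since recordFlapCount only calls WriteFlapCount and getPeerMonitor only
// calls ReadFlapCount. A sketch; the map, mutex and cfg are hypothetical test
// scaffolding:
//
//	var (
//		mu sync.Mutex
//		db = make(peerFlapCountMap)
//	)
//
//	cfg.WriteFlapCount = func(
//		updates map[route.Vertex]*channeldb.FlapCount) error {
//
//		mu.Lock()
//		defer mu.Unlock()
//
//		for peer, flapCount := range updates {
//			db[peer] = flapCount
//		}
//
//		return nil
//	}
//
//	cfg.ReadFlapCount = func(peer route.Vertex) (*channeldb.FlapCount,
//		error) {
//
//		mu.Lock()
//		defer mu.Unlock()
//
//		flapCount, ok := db[peer]
//		if !ok {
//			return nil, channeldb.ErrNoPeerBucket
//		}
//
//		return flapCount, nil
//	}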