github.com/decred/dcrlnd@v0.7.6/chanfitness/chaneventstore.go

// Package chanfitness monitors the behaviour of channels to provide insight
// into the health and performance of a channel. This is achieved by
// maintaining an event store which tracks events for each channel.
//
// Lifespan: the period that the channel has been known to the scoring system.
// Note that lifespan may not equal the channel's full lifetime because data is
// not currently persisted.
//
// Uptime: the total time within a given period that the channel's remote peer
// has been online.
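//
// For example, a channel that the store has monitored for ten hours, and
// whose peer was connected for nine of those hours, has a lifespan of 10h and
// an uptime of 9h.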
package chanfitness

import (
	"errors"
	"sync"
	"time"

	"github.com/decred/dcrd/wire"
	"github.com/decred/dcrlnd/channeldb"
	"github.com/decred/dcrlnd/channelnotifier"
	"github.com/decred/dcrlnd/clock"
	"github.com/decred/dcrlnd/peernotifier"
	"github.com/decred/dcrlnd/routing/route"
	"github.com/decred/dcrlnd/subscribe"
	"github.com/decred/dcrlnd/ticker"
)

const (
	// FlapCountFlushRate determines how often we write peers' total flap
	// counts to disk.
	FlapCountFlushRate = time.Hour
)

var (
	// errShuttingDown is returned when the store cannot respond to a query
	// because it has received the shutdown signal.
	errShuttingDown = errors.New("channel event store shutting down")

	// ErrChannelNotFound is returned when a query is made for a channel
	// that the event store does not have knowledge of.
	ErrChannelNotFound = errors.New("channel not found in event store")

	// ErrPeerNotFound is returned when a query is made for a peer that
	// the event store is not currently tracking.
	ErrPeerNotFound = errors.New("peer not found in event store")
)

// ChannelEventStore maintains a set of event logs for the node's channels to
// provide insight into the performance and health of channels.
type ChannelEventStore struct {
	cfg *Config

	// peers tracks all of our currently monitored peers and their
	// channels.
	peers map[route.Vertex]peerMonitor

	// chanInfoRequests serves requests for information about our
	// channels.
	chanInfoRequests chan channelInfoRequest

	// peerRequests serves requests for information about a peer.
	peerRequests chan peerRequest

	quit chan struct{}

	wg sync.WaitGroup
}

// Config provides the event store with functions required to monitor channel
// activity. All elements of the config must be non-nil for the event store to
// operate.
type Config struct {
	// SubscribeChannelEvents provides a subscription client which provides
	// a stream of channel events.
	SubscribeChannelEvents func() (subscribe.Subscription, error)

	// SubscribePeerEvents provides a subscription client which provides a
	// stream of peer online/offline events.
	SubscribePeerEvents func() (subscribe.Subscription, error)

	// GetOpenChannels provides a list of existing open channels which is
	// used to populate the ChannelEventStore with a set of channels on
	// startup.
	GetOpenChannels func() ([]*channeldb.OpenChannel, error)

	// Clock is the time source that the subsystem uses, provided here
	// for ease of testing.
	Clock clock.Clock

	// WriteFlapCount records the flap count for a set of peers on disk.
	WriteFlapCount func(map[route.Vertex]*channeldb.FlapCount) error

	// ReadFlapCount gets the flap count for a peer from disk.
	ReadFlapCount func(route.Vertex) (*channeldb.FlapCount, error)

	// FlapCountTicker is a ticker which controls how often we flush our
	// peers' flap counts to disk.
	FlapCountTicker ticker.Ticker
}
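
// A minimal wiring sketch for Config. The notifier and database values shown
// here (channelNotifier, peerNotifier, db) are illustrative assumptions about
// the surrounding server, not requirements of this package:
//
//	cfg := &Config{
//		SubscribeChannelEvents: func() (subscribe.Subscription, error) {
//			return channelNotifier.SubscribeChannelEvents()
//		},
//		SubscribePeerEvents: func() (subscribe.Subscription, error) {
//			return peerNotifier.SubscribePeerEvents()
//		},
//		GetOpenChannels: db.FetchAllOpenChannels,
//		Clock:           clock.NewDefaultClock(),
//		WriteFlapCount:  db.WriteFlapCounts,
//		ReadFlapCount:   db.ReadFlapCount,
//		FlapCountTicker: ticker.New(FlapCountFlushRate),
//	}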

// peerFlapCountMap maps peers to their flap counts. It is declared here to
// allow shorter function signatures.
type peerFlapCountMap map[route.Vertex]*channeldb.FlapCount

type channelInfoRequest struct {
	peer         route.Vertex
	channelPoint wire.OutPoint
	responseChan chan channelInfoResponse
}

type channelInfoResponse struct {
	info *ChannelInfo
	err  error
}

type peerRequest struct {
	peer         route.Vertex
	responseChan chan peerResponse
}

type peerResponse struct {
	flapCount int
	ts        *time.Time
	err       error
}

// NewChannelEventStore initializes an event store with the config provided.
// Note that this function does not start the main event loop; Start() must be
// called to begin processing events.
func NewChannelEventStore(config *Config) *ChannelEventStore {
	store := &ChannelEventStore{
		cfg:              config,
		peers:            make(map[route.Vertex]peerMonitor),
		chanInfoRequests: make(chan channelInfoRequest),
		peerRequests:     make(chan peerRequest),
		quit:             make(chan struct{}),
	}

	return store
}
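
// A usage sketch for the store's lifecycle: construct it with a fully
// populated Config, start it, and stop it on shutdown (error handling is
// elided for brevity):
//
//	store := NewChannelEventStore(cfg)
//	if err := store.Start(); err != nil {
//		// Handle startup failure.
//	}
//	defer store.Stop()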

// Start adds all existing open channels to the event store and starts the
// main loop which records channel and peer events, and serves requests for
// information from the store. If this function fails, it cancels its existing
// subscriptions and returns an error.
func (c *ChannelEventStore) Start() error {
	// Create a subscription to channel events.
	channelClient, err := c.cfg.SubscribeChannelEvents()
	if err != nil {
		return err
	}

	// Create a subscription to peer events. If an error occurs, cancel
	// the existing subscription to channel events and return.
	peerClient, err := c.cfg.SubscribePeerEvents()
	if err != nil {
		channelClient.Cancel()
		return err
	}

	// cancel should be called to cancel all subscriptions if an error
	// occurs.
	cancel := func() {
		channelClient.Cancel()
		peerClient.Cancel()
	}

	// Add the existing set of channels to the event store. This is
	// required because channel events will not be triggered for channels
	// that exist at startup time.
	channels, err := c.cfg.GetOpenChannels()
	if err != nil {
		cancel()
		return err
	}

	log.Infof("Adding %v channels to event store", len(channels))

	for _, ch := range channels {
		peerKey, err := route.NewVertexFromBytes(
			ch.IdentityPub.SerializeCompressed(),
		)
		if err != nil {
			cancel()
			return err
		}

		// Add existing channels to the channel store with an initial
		// peer online or offline event.
		c.addChannel(ch.FundingOutpoint, peerKey)
	}

	// Start a goroutine that consumes events from all subscriptions.
	c.wg.Add(1)
	go c.consume(&subscriptions{
		channelUpdates: channelClient.Updates(),
		peerUpdates:    peerClient.Updates(),
		cancel:         cancel,
	})

	return nil
}

// Stop terminates all goroutines started by the event store.
func (c *ChannelEventStore) Stop() {
	log.Info("Stopping event store")

	// Stop the consume goroutine.
	close(c.quit)
	c.wg.Wait()

	// Stop the ticker after the goroutine reading from it has exited, to
	// avoid a race.
	c.cfg.FlapCountTicker.Stop()
}

// addChannel checks whether we are already tracking a channel's peer,
// creates a new peer log to track it if we are not yet monitoring it, and
// adds the channel to that peer's monitor.
func (c *ChannelEventStore) addChannel(channelPoint wire.OutPoint,
	peer route.Vertex) {

	peerMonitor, err := c.getPeerMonitor(peer)
	if err != nil {
		log.Errorf("could not create monitor: %v", err)
		return
	}

	if err := peerMonitor.addChannel(channelPoint); err != nil {
		log.Errorf("could not add channel: %v", err)
	}
}

// getPeerMonitor tries to get an existing peer monitor from our in-memory
// list, and falls back to creating a new monitor if the peer is not
// currently known.
func (c *ChannelEventStore) getPeerMonitor(peer route.Vertex) (peerMonitor,
	error) {

	peerMonitor, ok := c.peers[peer]
	if ok {
		return peerMonitor, nil
	}

	var (
		flapCount int
		lastFlap  *time.Time
	)

	historicalFlap, err := c.cfg.ReadFlapCount(peer)
	switch err {
	// If we do not have any records for this peer, we leave the flap
	// count at zero and the last flap timestamp nil.
	case channeldb.ErrNoPeerBucket:

	case nil:
		flapCount = int(historicalFlap.Count)
		lastFlap = &historicalFlap.LastFlap

	// Return if we get an unexpected error.
	default:
		return nil, err
	}

	peerMonitor = newPeerLog(c.cfg.Clock, flapCount, lastFlap)
	c.peers[peer] = peerMonitor

	return peerMonitor, nil
}

// closeChannel records a closed time for a channel, and returns early if the
// channel is not known to the event store. We log warnings (rather than
// errors) when we cannot find a peer/channel because channels that we restore
// from a static channel backup do not have their channel open notified, so
// the event store never learns about them, but they are closed using the
// regular flow, so we will try to remove them on close. At present, we cannot
// easily distinguish between these closes and others.
func (c *ChannelEventStore) closeChannel(channelPoint wire.OutPoint,
	peer route.Vertex) {

	peerMonitor, ok := c.peers[peer]
	if !ok {
		log.Warnf("peer not known to store: %v", peer)
		return
	}

	if err := peerMonitor.removeChannel(channelPoint); err != nil {
		log.Warnf("could not remove channel: %v", err)
	}
}

// peerEvent creates a peer monitor for a peer if we do not currently have
// one, and records an online or offline event for it.
func (c *ChannelEventStore) peerEvent(peer route.Vertex, online bool) {
	peerMonitor, err := c.getPeerMonitor(peer)
	if err != nil {
		log.Errorf("could not create monitor: %v", err)
		return
	}

	peerMonitor.onlineEvent(online)
}

// subscriptions abstracts over the subscription clients so that they can be
// mocked in tests.
type subscriptions struct {
	channelUpdates <-chan interface{}
	peerUpdates    <-chan interface{}
	cancel         func()
}
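
// A minimal sketch of how a test might stub these subscriptions out without
// running real notifiers. The buffered channels and no-op cancel are
// illustrative choices, and peerKey is assumed to be the peer's route.Vertex:
//
//	channelUpdates := make(chan interface{}, 1)
//	peerUpdates := make(chan interface{}, 1)
//
//	subs := &subscriptions{
//		channelUpdates: channelUpdates,
//		peerUpdates:    peerUpdates,
//		cancel:         func() {},
//	}
//
//	// Inject a fake peer online event for the store to consume.
//	peerUpdates <- peernotifier.PeerOnlineEvent{PubKey: peerKey}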

// consume is the event store's main loop. It consumes subscriptions to
// update the event store with channel and peer events, and serves requests
// for channel uptime and lifespan.
func (c *ChannelEventStore) consume(subscriptions *subscriptions) {
	// Start our flap count ticker.
	c.cfg.FlapCountTicker.Resume()

	// On exit, we will cancel our subscriptions and write our most recent
	// flap counts to disk. This ensures that we have consistent data in
	// the case of a graceful shutdown. If we do not shut down gracefully,
	// our worst case is data from our last flap count tick (one hour).
	defer func() {
		subscriptions.cancel()

		if err := c.recordFlapCount(); err != nil {
			log.Errorf("error recording flap on shutdown: %v", err)
		}

		c.wg.Done()
	}()

	// Consume events until the channel is closed.
	for {
		select {
		// Process channel opened and closed events.
		case e := <-subscriptions.channelUpdates:
			switch event := e.(type) {
			// A new channel has been opened, so we must add the
			// channel to the store and record a channel open
			// event.
			case channelnotifier.OpenChannelEvent:
				compressed := event.Channel.IdentityPub.SerializeCompressed()
				peerKey, err := route.NewVertexFromBytes(
					compressed,
				)
				if err != nil {
					log.Errorf("Could not get vertex "+
						"from: %v", compressed)
					continue
				}

				c.addChannel(
					event.Channel.FundingOutpoint, peerKey,
				)

			// A channel has been closed, so we must remove the
			// channel from the store and record a channel closed
			// event.
			case channelnotifier.ClosedChannelEvent:
				compressed := event.CloseSummary.RemotePub.SerializeCompressed()
				peerKey, err := route.NewVertexFromBytes(
					compressed,
				)
				if err != nil {
					log.Errorf("Could not get vertex "+
						"from: %v", compressed)
					continue
				}

				c.closeChannel(
					event.CloseSummary.ChanPoint, peerKey,
				)
			}

		// Process peer online and offline events.
		case e := <-subscriptions.peerUpdates:
			switch event := e.(type) {
			// We have reestablished a connection with our peer,
			// and should record an online event for any channels
			// with that peer.
			case peernotifier.PeerOnlineEvent:
				c.peerEvent(event.PubKey, true)

			// We have lost a connection with our peer, and should
			// record an offline event for any channels with that
			// peer.
			case peernotifier.PeerOfflineEvent:
				c.peerEvent(event.PubKey, false)
			}

		// Serve all requests for channel lifetime.
		case req := <-c.chanInfoRequests:
			var resp channelInfoResponse

			resp.info, resp.err = c.getChanInfo(req)
			req.responseChan <- resp

		// Serve all requests for information about a peer.
		case req := <-c.peerRequests:
			var resp peerResponse

			resp.flapCount, resp.ts, resp.err = c.flapCount(
				req.peer,
			)
			req.responseChan <- resp

		case <-c.cfg.FlapCountTicker.Ticks():
			if err := c.recordFlapCount(); err != nil {
				log.Errorf("could not record flap "+
					"count: %v", err)
			}

		// Exit if the store receives the signal to shut down.
		case <-c.quit:
			return
		}
	}
}

// ChannelInfo provides the set of information that the event store has
// recorded for a channel.
type ChannelInfo struct {
	// Lifetime is the total amount of time we have monitored the channel
	// for.
	Lifetime time.Duration

	// Uptime is the total amount of time that the channel peer has been
	// observed as online during the monitored lifespan.
	Uptime time.Duration
}

// GetChanInfo gets all the information we have on a channel in the event
// store.
func (c *ChannelEventStore) GetChanInfo(channelPoint wire.OutPoint,
	peer route.Vertex) (*ChannelInfo, error) {

	request := channelInfoRequest{
		peer:         peer,
		channelPoint: channelPoint,
		responseChan: make(chan channelInfoResponse),
	}

	// Send a request for the channel's information to the main event
	// loop, or return early with an error if the store has already
	// received a shutdown signal.
	select {
	case c.chanInfoRequests <- request:
	case <-c.quit:
		return nil, errShuttingDown
	}

	// Return the response we receive on the response channel or exit
	// early if the store is instructed to exit.
	select {
	case resp := <-request.responseChan:
		return resp.info, resp.err

	case <-c.quit:
		return nil, errShuttingDown
	}
}
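
// A usage sketch: query a channel's uptime and derive an uptime ratio over
// its monitored lifetime. The channelPoint and peerKey values are assumed to
// identify a channel that the store already tracks:
//
//	info, err := store.GetChanInfo(channelPoint, peerKey)
//	if err != nil {
//		return err
//	}
//
//	// Guard against a zero lifetime before dividing.
//	if info.Lifetime > 0 {
//		ratio := float64(info.Uptime) / float64(info.Lifetime)
//		log.Infof("channel uptime ratio: %.2f", ratio)
//	}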

// getChanInfo collects channel information for a channel. It gets uptime
// over the full lifetime of the channel.
func (c *ChannelEventStore) getChanInfo(req channelInfoRequest) (*ChannelInfo,
	error) {

	peerMonitor, ok := c.peers[req.peer]
	if !ok {
		return nil, ErrPeerNotFound
	}

	lifetime, uptime, err := peerMonitor.channelUptime(req.channelPoint)
	if err != nil {
		return nil, err
	}

	return &ChannelInfo{
		Lifetime: lifetime,
		Uptime:   uptime,
	}, nil
}

// FlapCount returns the flap count we have for a peer and the timestamp of
// its last flap. If we do not have any flaps recorded for the peer, the last
// flap timestamp will be nil.
func (c *ChannelEventStore) FlapCount(peer route.Vertex) (int, *time.Time,
	error) {

	request := peerRequest{
		peer:         peer,
		responseChan: make(chan peerResponse),
	}

	// Send a request for the peer's information to the main event loop,
	// or return early with an error if the store has already received a
	// shutdown signal.
	select {
	case c.peerRequests <- request:
	case <-c.quit:
		return 0, nil, errShuttingDown
	}

	// Return the response we receive on the response channel or exit
	// early if the store is instructed to exit.
	select {
	case resp := <-request.responseChan:
		return resp.flapCount, resp.ts, resp.err

	case <-c.quit:
		return 0, nil, errShuttingDown
	}
}
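
// A usage sketch: read a peer's flap count, treating a nil timestamp as "no
// flaps recorded" (peerKey is assumed to be the peer's route.Vertex):
//
//	count, lastFlap, err := store.FlapCount(peerKey)
//	if err != nil {
//		return err
//	}
//
//	if lastFlap == nil {
//		log.Info("no flaps recorded for peer")
//	} else {
//		log.Infof("peer flapped %v times, last at %v", count, lastFlap)
//	}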

// flapCount gets our peer flap count and last flap timestamp from our
// in-memory record of a peer, falling back to disk if we are not currently
// tracking the peer. If we have no flap count recorded for the peer, a nil
// last flap time will be returned.
func (c *ChannelEventStore) flapCount(peer route.Vertex) (int, *time.Time,
	error) {

	// First check whether we are tracking this peer in memory, because
	// this record will have the most accurate flap count. We do not fail
	// if we can't find the peer in memory, because we may have previously
	// recorded its flap count on disk.
	peerMonitor, ok := c.peers[peer]
	if ok {
		count, ts := peerMonitor.getFlapCount()
		return count, ts, nil
	}

	// Try to get our flap count from the database. If this value is not
	// recorded, we return a nil last flap time to indicate that we have
	// no record of the peer's flap count.
	flapCount, err := c.cfg.ReadFlapCount(peer)
	switch err {
	case channeldb.ErrNoPeerBucket:
		return 0, nil, nil

	case nil:
		return int(flapCount.Count), &flapCount.LastFlap, nil

	default:
		return 0, nil, err
	}
}

// recordFlapCount will record our flap count for each peer that we are
// currently tracking, skipping peers that have a zero flap count.
func (c *ChannelEventStore) recordFlapCount() error {
	updates := make(peerFlapCountMap)

	for peer, monitor := range c.peers {
		flapCount, lastFlap := monitor.getFlapCount()
		if lastFlap == nil {
			continue
		}

		updates[peer] = &channeldb.FlapCount{
			Count:    uint32(flapCount),
			LastFlap: *lastFlap,
		}
	}

	log.Debugf("recording flap count for %v peers", len(updates))

	return c.cfg.WriteFlapCount(updates)
}