github.com/decred/dcrlnd@v0.7.6/chanfitness/chanevent.go

github.com/decred/dcrlnd@v0.7.6/chanfitness/chanevent.go (about)

     1  package chanfitness
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/decred/dcrd/wire"
     8  	"github.com/decred/dcrlnd/clock"
     9  )
    10  
    11  type eventType int
    12  
    13  const (
    14  	peerOnlineEvent eventType = iota
    15  	peerOfflineEvent
    16  )
    17  
    18  // String provides string representations of channel events.
    19  func (e eventType) String() string {
    20  	switch e {
    21  	case peerOnlineEvent:
    22  		return "peer_online"
    23  
    24  	case peerOfflineEvent:
    25  		return "peer_offline"
    26  	}
    27  
    28  	return "unknown"
    29  }
    30  
    31  type event struct {
    32  	timestamp time.Time
    33  	eventType eventType
    34  }
    35  
    36  // peerLog tracks events for a peer and its channels. If we currently have no
    37  // channels with the peer, it will simply track its current online state. If we
    38  // do have channels open with the peer, it will track the peer's online and
    39  // offline events so that we can calculate uptime for our channels. A single
    40  // event log is used for these online and offline events, and uptime for a
    41  // channel is calculated by examining a subsection of this log.
    42  type peerLog struct {
    43  	// online stores whether the peer is currently online.
    44  	online bool
    45  
    46  	// onlineEvents is a log of timestamped events observed for the peer
    47  	// that we have committed to allocating memory to.
    48  	onlineEvents []*event
    49  
    50  	// stagedEvent represents an event that is pending addition to the
    51  	// events list. It has not yet been added because we rate limit the
    52  	// frequency that we store events at. We need to store this value
    53  	// in the log (rather than just ignore events) so that we can flush the
    54  	// aggregate outcome to our event log once the rate limiting period has
    55  	// ended.
    56  	//
    57  	// Take the following example:
    58  	// - Peer online event recorded
    59  	// - Peer offline event, not recorded due to rate limit
    60  	// - No more events, we incorrectly believe our peer to be online
    61  	// Instead of skipping events, we stage the most recent event during the
    62  	// rate limited period so that we know what happened (on aggregate)
    63  	// while we were rate limiting events.
    64  	//
    65  	// Note that we currently only store offline/online events so we can
    66  	// use this field to track our online state. With the addition of other
    67  	// event types, we need to only stage online/offline events, or split
    68  	// them out.
    69  	stagedEvent *event
    70  
    71  	// flapCount is the number of times this peer has been observed as
    72  	// going offline.
    73  	flapCount int
    74  
    75  	// lastFlap is the timestamp of the last flap we recorded for the peer.
    76  	// This value will be nil if we have never recorded a flap for the peer.
    77  	lastFlap *time.Time
    78  
    79  	// clock allows creation of deterministic unit tests.
    80  	clock clock.Clock
    81  
    82  	// channels contains a set of currently open channels. Channels will be
    83  	// added and removed from this map as they are opened and closed.
    84  	channels map[wire.OutPoint]*channelInfo
    85  }
    86  
    87  // newPeerLog creates a log for a peer, taking its historical flap count and
    88  // last flap time as parameters. These values may be zero/nil if we have no
    89  // record of historical flap count for the peer.
    90  func newPeerLog(clock clock.Clock, flapCount int,
    91  	lastFlap *time.Time) *peerLog {
    92  
    93  	return &peerLog{
    94  		clock:     clock,
    95  		flapCount: flapCount,
    96  		lastFlap:  lastFlap,
    97  		channels:  make(map[wire.OutPoint]*channelInfo),
    98  	}
    99  }
   100  
   101  // channelInfo contains information about a channel.
   102  type channelInfo struct {
   103  	// openedAt tracks the first time this channel was seen. This is not
   104  	// necessarily the time that it confirmed on chain because channel
   105  	// events are not persisted at present.
   106  	openedAt time.Time
   107  }
   108  
   109  func newChannelInfo(openedAt time.Time) *channelInfo {
   110  	return &channelInfo{
   111  		openedAt: openedAt,
   112  	}
   113  }
   114  
   115  // onlineEvent records a peer online or offline event in the log and increments
   116  // the peer's flap count.
   117  func (p *peerLog) onlineEvent(online bool) {
   118  	eventTime := p.clock.Now()
   119  
   120  	// If we have a non-nil last flap time, potentially apply a cooldown
   121  	// factor to the peer's flap count before we rate limit it. This allows
   122  	// us to decrease the penalty for historical flaps over time, provided
   123  	// the peer has not flapped for a while.
   124  	if p.lastFlap != nil {
   125  		p.flapCount = cooldownFlapCount(
   126  			p.clock.Now(), p.flapCount, *p.lastFlap,
   127  		)
   128  	}
   129  
   130  	// Record flap count information and online state regardless of whether
   131  	// we have any channels open with this peer.
   132  	p.flapCount++
   133  	p.lastFlap = &eventTime
   134  	p.online = online
   135  
   136  	// If we have no channels currently open with the peer, we do not want
   137  	// to commit resources to tracking their online state beyond a simple
   138  	// online boolean, so we exit early.
   139  	if p.channelCount() == 0 {
   140  		return
   141  	}
   142  
   143  	p.addEvent(online, eventTime)
   144  }
   145  
   146  // addEvent records an online or offline event in our event log. and increments
   147  // the peer's flap count.
   148  func (p *peerLog) addEvent(online bool, time time.Time) {
   149  	eventType := peerOnlineEvent
   150  	if !online {
   151  		eventType = peerOfflineEvent
   152  	}
   153  
   154  	event := &event{
   155  		timestamp: time,
   156  		eventType: eventType,
   157  	}
   158  
   159  	// If we have no staged events, we can just stage this event and return.
   160  	if p.stagedEvent == nil {
   161  		p.stagedEvent = event
   162  		return
   163  	}
   164  
   165  	// We get the amount of time we require between events according to
   166  	// peer flap count.
   167  	aggregation := getRateLimit(p.flapCount)
   168  	nextRecordTime := p.stagedEvent.timestamp.Add(aggregation)
   169  	flushEvent := nextRecordTime.Before(event.timestamp)
   170  
   171  	// If enough time has passed since our last staged event, we add our
   172  	// event to our in-memory list.
   173  	if flushEvent {
   174  		p.onlineEvents = append(p.onlineEvents, p.stagedEvent)
   175  	}
   176  
   177  	// Finally, we replace our staged event with the new event we received.
   178  	p.stagedEvent = event
   179  }
   180  
   181  // addChannel adds a channel to our log. If we have not tracked any online
   182  // events for our peer yet, we create one with our peer's current online state
   183  // so that we know the state that the peer had at channel start, which is
   184  // required to calculate uptime over the channel's lifetime.
   185  func (p *peerLog) addChannel(channelPoint wire.OutPoint) error {
   186  	_, ok := p.channels[channelPoint]
   187  	if ok {
   188  		return fmt.Errorf("channel: %v already present", channelPoint)
   189  	}
   190  
   191  	openTime := p.clock.Now()
   192  	p.channels[channelPoint] = newChannelInfo(openTime)
   193  
   194  	// If we do not have any online events tracked for our peer (which is
   195  	// the case when we have no other channels open with the peer), we add
   196  	// an event with the peer's current online state so that we know that
   197  	// starting state for this peer when a channel was connected (which
   198  	// allows us to calculate uptime over the lifetime of the channel).
   199  	if len(p.onlineEvents) == 0 {
   200  		p.addEvent(p.online, openTime)
   201  	}
   202  
   203  	return nil
   204  }
   205  
   206  // removeChannel removes a channel from our log. If we have no more channels
   207  // with the peer after removing this one, we clear our list of events.
   208  func (p *peerLog) removeChannel(channelPoint wire.OutPoint) error {
   209  	_, ok := p.channels[channelPoint]
   210  	if !ok {
   211  		return fmt.Errorf("channel: %v not present", channelPoint)
   212  	}
   213  
   214  	delete(p.channels, channelPoint)
   215  
   216  	// If we have no more channels in our event log, we can discard all of
   217  	// our online events in memory, since we don't need them anymore.
   218  	// TODO(carla): this could be done on a per channel basis.
   219  	if p.channelCount() == 0 {
   220  		p.onlineEvents = nil
   221  		p.stagedEvent = nil
   222  	}
   223  
   224  	return nil
   225  }
   226  
   227  // channelCount returns the number of channels that we currently have
   228  // with the peer.
   229  func (p *peerLog) channelCount() int {
   230  	return len(p.channels)
   231  }
   232  
   233  // channelUptime looks up a channel and returns the amount of time that the
   234  // channel has been monitored for and its uptime over this period.
   235  func (p *peerLog) channelUptime(channelPoint wire.OutPoint) (time.Duration,
   236  	time.Duration, error) {
   237  
   238  	channel, ok := p.channels[channelPoint]
   239  	if !ok {
   240  		return 0, 0, ErrChannelNotFound
   241  	}
   242  
   243  	now := p.clock.Now()
   244  
   245  	uptime, err := p.uptime(channel.openedAt, now)
   246  	if err != nil {
   247  		return 0, 0, err
   248  	}
   249  
   250  	return now.Sub(channel.openedAt), uptime, nil
   251  }
   252  
   253  // getFlapCount returns the peer's flap count and the timestamp that we last
   254  // recorded a flap.
   255  func (p *peerLog) getFlapCount() (int, *time.Time) {
   256  	return p.flapCount, p.lastFlap
   257  }
   258  
   259  // listEvents returns all of the events that our event log has tracked,
   260  // including events that are staged for addition to our set of events but have
   261  // not yet been committed to (because we rate limit and store only the aggregate
   262  // outcome over a period).
   263  func (p *peerLog) listEvents() []*event {
   264  	if p.stagedEvent == nil {
   265  		return p.onlineEvents
   266  	}
   267  
   268  	return append(p.onlineEvents, p.stagedEvent)
   269  }
   270  
   271  // onlinePeriod represents a period of time over which a peer was online.
   272  type onlinePeriod struct {
   273  	start, end time.Time
   274  }
   275  
   276  // getOnlinePeriods returns a list of all the periods that the event log has
   277  // recorded the remote peer as being online. In the unexpected case where there
   278  // are no events, the function returns early. Online periods are defined as a
   279  // peer online event which is terminated by a peer offline event. If the event
   280  // log ends on a peer online event, it appends a final period which is
   281  // calculated until the present. This function expects the event log provided
   282  // to be ordered by ascending timestamp, and can tolerate multiple consecutive
   283  // online or offline events.
   284  func (p *peerLog) getOnlinePeriods() []*onlinePeriod {
   285  	events := p.listEvents()
   286  
   287  	// Return early if there are no events, there are no online periods.
   288  	if len(events) == 0 {
   289  		return nil
   290  	}
   291  
   292  	var (
   293  		// lastEvent tracks the last event that we had that was of
   294  		// a different type to our own. It is used to determine the
   295  		// start time of our online periods when we experience an
   296  		// offline event, and to track our last recorded state.
   297  		lastEvent     *event
   298  		onlinePeriods []*onlinePeriod
   299  	)
   300  
   301  	// Loop through all events to build a list of periods that the peer was
   302  	// online. Online periods are added when they are terminated with a peer
   303  	// offline event. If the log ends on an online event, the period between
   304  	// the online event and the present is not tracked. The type of the most
   305  	// recent event is tracked using the offline bool so that we can add a
   306  	// final online period if necessary.
   307  	for _, event := range events {
   308  		switch event.eventType {
   309  		case peerOnlineEvent:
   310  			// If our previous event is nil, we just set it and
   311  			// break out of the switch.
   312  			if lastEvent == nil {
   313  				lastEvent = event
   314  				break
   315  			}
   316  
   317  			// If our previous event was an offline event, we update
   318  			// it to this event. We do not do this if it was an
   319  			// online event because duplicate online events would
   320  			// progress our online timestamp forward (rather than
   321  			// keep it at our earliest online event timestamp).
   322  			if lastEvent.eventType == peerOfflineEvent {
   323  				lastEvent = event
   324  			}
   325  
   326  		case peerOfflineEvent:
   327  			// If our previous event is nil, we just set it and
   328  			// break out of the switch since we cannot record an
   329  			// online period from this single event.
   330  			if lastEvent == nil {
   331  				lastEvent = event
   332  				break
   333  			}
   334  
   335  			// If the last event we saw was an online event, we
   336  			// add an online period to our set and progress our
   337  			// previous event to this offline event. We do not
   338  			// do this if we have had duplicate offline events
   339  			// because we would be tracking the most recent offline
   340  			// event (rather than keep it at our earliest offline
   341  			// event timestamp).
   342  			if lastEvent.eventType == peerOnlineEvent {
   343  				onlinePeriods = append(
   344  					onlinePeriods, &onlinePeriod{
   345  						start: lastEvent.timestamp,
   346  						end:   event.timestamp,
   347  					},
   348  				)
   349  
   350  				lastEvent = event
   351  			}
   352  		}
   353  	}
   354  
   355  	// If the last event was an peer offline event, we do not need to
   356  	// calculate a final online period and can return online periods as is.
   357  	if lastEvent.eventType == peerOfflineEvent {
   358  		return onlinePeriods
   359  	}
   360  
   361  	// The log ended on an online event, so we need to add a final online
   362  	// period which terminates at the present.
   363  	finalEvent := &onlinePeriod{
   364  		start: lastEvent.timestamp,
   365  		end:   p.clock.Now(),
   366  	}
   367  
   368  	// Add the final online period to the set and return.
   369  	return append(onlinePeriods, finalEvent)
   370  }
   371  
   372  // uptime calculates the total uptime we have recorded for a peer over the
   373  // inclusive range specified. An error is returned if the end of the range is
   374  // before the start or a zero end time is returned.
   375  func (p *peerLog) uptime(start, end time.Time) (time.Duration, error) {
   376  	// Error if we are provided with an invalid range to calculate uptime
   377  	// for.
   378  	if end.Before(start) {
   379  		return 0, fmt.Errorf("end time: %v before start time: %v",
   380  			end, start)
   381  	}
   382  	if end.IsZero() {
   383  		return 0, fmt.Errorf("zero end time")
   384  	}
   385  
   386  	var uptime time.Duration
   387  
   388  	for _, p := range p.getOnlinePeriods() {
   389  		// The online period ends before the range we're looking at, so
   390  		// we can skip over it.
   391  		if p.end.Before(start) {
   392  			continue
   393  		}
   394  		// The online period starts after the range we're looking at, so
   395  		// can stop calculating uptime.
   396  		if p.start.After(end) {
   397  			break
   398  		}
   399  
   400  		// If the online period starts before our range, shift the start
   401  		// time up so that we only calculate uptime from the start of
   402  		// our range.
   403  		if p.start.Before(start) {
   404  			p.start = start
   405  		}
   406  
   407  		// If the online period ends before our range, shift the end
   408  		// time forward so that we only calculate uptime until the end
   409  		// of the range.
   410  		if p.end.After(end) {
   411  			p.end = end
   412  		}
   413  
   414  		uptime += p.end.Sub(p.start)
   415  	}
   416  
   417  	return uptime, nil
   418  }