github.com/decred/dcrlnd@v0.7.6/chanfitness/chanevent.go (about) 1 package chanfitness 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/decred/dcrd/wire" 8 "github.com/decred/dcrlnd/clock" 9 ) 10 11 type eventType int 12 13 const ( 14 peerOnlineEvent eventType = iota 15 peerOfflineEvent 16 ) 17 18 // String provides string representations of channel events. 19 func (e eventType) String() string { 20 switch e { 21 case peerOnlineEvent: 22 return "peer_online" 23 24 case peerOfflineEvent: 25 return "peer_offline" 26 } 27 28 return "unknown" 29 } 30 31 type event struct { 32 timestamp time.Time 33 eventType eventType 34 } 35 36 // peerLog tracks events for a peer and its channels. If we currently have no 37 // channels with the peer, it will simply track its current online state. If we 38 // do have channels open with the peer, it will track the peer's online and 39 // offline events so that we can calculate uptime for our channels. A single 40 // event log is used for these online and offline events, and uptime for a 41 // channel is calculated by examining a subsection of this log. 42 type peerLog struct { 43 // online stores whether the peer is currently online. 44 online bool 45 46 // onlineEvents is a log of timestamped events observed for the peer 47 // that we have committed to allocating memory to. 48 onlineEvents []*event 49 50 // stagedEvent represents an event that is pending addition to the 51 // events list. It has not yet been added because we rate limit the 52 // frequency that we store events at. We need to store this value 53 // in the log (rather than just ignore events) so that we can flush the 54 // aggregate outcome to our event log once the rate limiting period has 55 // ended. 56 // 57 // Take the following example: 58 // - Peer online event recorded 59 // - Peer offline event, not recorded due to rate limit 60 // - No more events, we incorrectly believe our peer to be online 61 // Instead of skipping events, we stage the most recent event during the 62 // rate limited period so that we know what happened (on aggregate) 63 // while we were rate limiting events. 64 // 65 // Note that we currently only store offline/online events so we can 66 // use this field to track our online state. With the addition of other 67 // event types, we need to only stage online/offline events, or split 68 // them out. 69 stagedEvent *event 70 71 // flapCount is the number of times this peer has been observed as 72 // going offline. 73 flapCount int 74 75 // lastFlap is the timestamp of the last flap we recorded for the peer. 76 // This value will be nil if we have never recorded a flap for the peer. 77 lastFlap *time.Time 78 79 // clock allows creation of deterministic unit tests. 80 clock clock.Clock 81 82 // channels contains a set of currently open channels. Channels will be 83 // added and removed from this map as they are opened and closed. 84 channels map[wire.OutPoint]*channelInfo 85 } 86 87 // newPeerLog creates a log for a peer, taking its historical flap count and 88 // last flap time as parameters. These values may be zero/nil if we have no 89 // record of historical flap count for the peer. 90 func newPeerLog(clock clock.Clock, flapCount int, 91 lastFlap *time.Time) *peerLog { 92 93 return &peerLog{ 94 clock: clock, 95 flapCount: flapCount, 96 lastFlap: lastFlap, 97 channels: make(map[wire.OutPoint]*channelInfo), 98 } 99 } 100 101 // channelInfo contains information about a channel. 102 type channelInfo struct { 103 // openedAt tracks the first time this channel was seen. This is not 104 // necessarily the time that it confirmed on chain because channel 105 // events are not persisted at present. 106 openedAt time.Time 107 } 108 109 func newChannelInfo(openedAt time.Time) *channelInfo { 110 return &channelInfo{ 111 openedAt: openedAt, 112 } 113 } 114 115 // onlineEvent records a peer online or offline event in the log and increments 116 // the peer's flap count. 117 func (p *peerLog) onlineEvent(online bool) { 118 eventTime := p.clock.Now() 119 120 // If we have a non-nil last flap time, potentially apply a cooldown 121 // factor to the peer's flap count before we rate limit it. This allows 122 // us to decrease the penalty for historical flaps over time, provided 123 // the peer has not flapped for a while. 124 if p.lastFlap != nil { 125 p.flapCount = cooldownFlapCount( 126 p.clock.Now(), p.flapCount, *p.lastFlap, 127 ) 128 } 129 130 // Record flap count information and online state regardless of whether 131 // we have any channels open with this peer. 132 p.flapCount++ 133 p.lastFlap = &eventTime 134 p.online = online 135 136 // If we have no channels currently open with the peer, we do not want 137 // to commit resources to tracking their online state beyond a simple 138 // online boolean, so we exit early. 139 if p.channelCount() == 0 { 140 return 141 } 142 143 p.addEvent(online, eventTime) 144 } 145 146 // addEvent records an online or offline event in our event log. and increments 147 // the peer's flap count. 148 func (p *peerLog) addEvent(online bool, time time.Time) { 149 eventType := peerOnlineEvent 150 if !online { 151 eventType = peerOfflineEvent 152 } 153 154 event := &event{ 155 timestamp: time, 156 eventType: eventType, 157 } 158 159 // If we have no staged events, we can just stage this event and return. 160 if p.stagedEvent == nil { 161 p.stagedEvent = event 162 return 163 } 164 165 // We get the amount of time we require between events according to 166 // peer flap count. 167 aggregation := getRateLimit(p.flapCount) 168 nextRecordTime := p.stagedEvent.timestamp.Add(aggregation) 169 flushEvent := nextRecordTime.Before(event.timestamp) 170 171 // If enough time has passed since our last staged event, we add our 172 // event to our in-memory list. 173 if flushEvent { 174 p.onlineEvents = append(p.onlineEvents, p.stagedEvent) 175 } 176 177 // Finally, we replace our staged event with the new event we received. 178 p.stagedEvent = event 179 } 180 181 // addChannel adds a channel to our log. If we have not tracked any online 182 // events for our peer yet, we create one with our peer's current online state 183 // so that we know the state that the peer had at channel start, which is 184 // required to calculate uptime over the channel's lifetime. 185 func (p *peerLog) addChannel(channelPoint wire.OutPoint) error { 186 _, ok := p.channels[channelPoint] 187 if ok { 188 return fmt.Errorf("channel: %v already present", channelPoint) 189 } 190 191 openTime := p.clock.Now() 192 p.channels[channelPoint] = newChannelInfo(openTime) 193 194 // If we do not have any online events tracked for our peer (which is 195 // the case when we have no other channels open with the peer), we add 196 // an event with the peer's current online state so that we know that 197 // starting state for this peer when a channel was connected (which 198 // allows us to calculate uptime over the lifetime of the channel). 199 if len(p.onlineEvents) == 0 { 200 p.addEvent(p.online, openTime) 201 } 202 203 return nil 204 } 205 206 // removeChannel removes a channel from our log. If we have no more channels 207 // with the peer after removing this one, we clear our list of events. 208 func (p *peerLog) removeChannel(channelPoint wire.OutPoint) error { 209 _, ok := p.channels[channelPoint] 210 if !ok { 211 return fmt.Errorf("channel: %v not present", channelPoint) 212 } 213 214 delete(p.channels, channelPoint) 215 216 // If we have no more channels in our event log, we can discard all of 217 // our online events in memory, since we don't need them anymore. 218 // TODO(carla): this could be done on a per channel basis. 219 if p.channelCount() == 0 { 220 p.onlineEvents = nil 221 p.stagedEvent = nil 222 } 223 224 return nil 225 } 226 227 // channelCount returns the number of channels that we currently have 228 // with the peer. 229 func (p *peerLog) channelCount() int { 230 return len(p.channels) 231 } 232 233 // channelUptime looks up a channel and returns the amount of time that the 234 // channel has been monitored for and its uptime over this period. 235 func (p *peerLog) channelUptime(channelPoint wire.OutPoint) (time.Duration, 236 time.Duration, error) { 237 238 channel, ok := p.channels[channelPoint] 239 if !ok { 240 return 0, 0, ErrChannelNotFound 241 } 242 243 now := p.clock.Now() 244 245 uptime, err := p.uptime(channel.openedAt, now) 246 if err != nil { 247 return 0, 0, err 248 } 249 250 return now.Sub(channel.openedAt), uptime, nil 251 } 252 253 // getFlapCount returns the peer's flap count and the timestamp that we last 254 // recorded a flap. 255 func (p *peerLog) getFlapCount() (int, *time.Time) { 256 return p.flapCount, p.lastFlap 257 } 258 259 // listEvents returns all of the events that our event log has tracked, 260 // including events that are staged for addition to our set of events but have 261 // not yet been committed to (because we rate limit and store only the aggregate 262 // outcome over a period). 263 func (p *peerLog) listEvents() []*event { 264 if p.stagedEvent == nil { 265 return p.onlineEvents 266 } 267 268 return append(p.onlineEvents, p.stagedEvent) 269 } 270 271 // onlinePeriod represents a period of time over which a peer was online. 272 type onlinePeriod struct { 273 start, end time.Time 274 } 275 276 // getOnlinePeriods returns a list of all the periods that the event log has 277 // recorded the remote peer as being online. In the unexpected case where there 278 // are no events, the function returns early. Online periods are defined as a 279 // peer online event which is terminated by a peer offline event. If the event 280 // log ends on a peer online event, it appends a final period which is 281 // calculated until the present. This function expects the event log provided 282 // to be ordered by ascending timestamp, and can tolerate multiple consecutive 283 // online or offline events. 284 func (p *peerLog) getOnlinePeriods() []*onlinePeriod { 285 events := p.listEvents() 286 287 // Return early if there are no events, there are no online periods. 288 if len(events) == 0 { 289 return nil 290 } 291 292 var ( 293 // lastEvent tracks the last event that we had that was of 294 // a different type to our own. It is used to determine the 295 // start time of our online periods when we experience an 296 // offline event, and to track our last recorded state. 297 lastEvent *event 298 onlinePeriods []*onlinePeriod 299 ) 300 301 // Loop through all events to build a list of periods that the peer was 302 // online. Online periods are added when they are terminated with a peer 303 // offline event. If the log ends on an online event, the period between 304 // the online event and the present is not tracked. The type of the most 305 // recent event is tracked using the offline bool so that we can add a 306 // final online period if necessary. 307 for _, event := range events { 308 switch event.eventType { 309 case peerOnlineEvent: 310 // If our previous event is nil, we just set it and 311 // break out of the switch. 312 if lastEvent == nil { 313 lastEvent = event 314 break 315 } 316 317 // If our previous event was an offline event, we update 318 // it to this event. We do not do this if it was an 319 // online event because duplicate online events would 320 // progress our online timestamp forward (rather than 321 // keep it at our earliest online event timestamp). 322 if lastEvent.eventType == peerOfflineEvent { 323 lastEvent = event 324 } 325 326 case peerOfflineEvent: 327 // If our previous event is nil, we just set it and 328 // break out of the switch since we cannot record an 329 // online period from this single event. 330 if lastEvent == nil { 331 lastEvent = event 332 break 333 } 334 335 // If the last event we saw was an online event, we 336 // add an online period to our set and progress our 337 // previous event to this offline event. We do not 338 // do this if we have had duplicate offline events 339 // because we would be tracking the most recent offline 340 // event (rather than keep it at our earliest offline 341 // event timestamp). 342 if lastEvent.eventType == peerOnlineEvent { 343 onlinePeriods = append( 344 onlinePeriods, &onlinePeriod{ 345 start: lastEvent.timestamp, 346 end: event.timestamp, 347 }, 348 ) 349 350 lastEvent = event 351 } 352 } 353 } 354 355 // If the last event was an peer offline event, we do not need to 356 // calculate a final online period and can return online periods as is. 357 if lastEvent.eventType == peerOfflineEvent { 358 return onlinePeriods 359 } 360 361 // The log ended on an online event, so we need to add a final online 362 // period which terminates at the present. 363 finalEvent := &onlinePeriod{ 364 start: lastEvent.timestamp, 365 end: p.clock.Now(), 366 } 367 368 // Add the final online period to the set and return. 369 return append(onlinePeriods, finalEvent) 370 } 371 372 // uptime calculates the total uptime we have recorded for a peer over the 373 // inclusive range specified. An error is returned if the end of the range is 374 // before the start or a zero end time is returned. 375 func (p *peerLog) uptime(start, end time.Time) (time.Duration, error) { 376 // Error if we are provided with an invalid range to calculate uptime 377 // for. 378 if end.Before(start) { 379 return 0, fmt.Errorf("end time: %v before start time: %v", 380 end, start) 381 } 382 if end.IsZero() { 383 return 0, fmt.Errorf("zero end time") 384 } 385 386 var uptime time.Duration 387 388 for _, p := range p.getOnlinePeriods() { 389 // The online period ends before the range we're looking at, so 390 // we can skip over it. 391 if p.end.Before(start) { 392 continue 393 } 394 // The online period starts after the range we're looking at, so 395 // can stop calculating uptime. 396 if p.start.After(end) { 397 break 398 } 399 400 // If the online period starts before our range, shift the start 401 // time up so that we only calculate uptime from the start of 402 // our range. 403 if p.start.Before(start) { 404 p.start = start 405 } 406 407 // If the online period ends before our range, shift the end 408 // time forward so that we only calculate uptime until the end 409 // of the range. 410 if p.end.After(end) { 411 p.end = end 412 } 413 414 uptime += p.end.Sub(p.start) 415 } 416 417 return uptime, nil 418 }