github.com/decred/dcrlnd@v0.7.6/routing/missioncontrol.go (about) 1 package routing 2 3 import ( 4 "errors" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/decred/dcrlnd/channeldb" 10 "github.com/decred/dcrlnd/kvdb" 11 "github.com/decred/dcrlnd/lnwire" 12 "github.com/decred/dcrlnd/routing/route" 13 ) 14 15 const ( 16 // DefaultPenaltyHalfLife is the default half-life duration. The 17 // half-life duration defines after how much time a penalized node or 18 // channel is back at 50% probability. 19 DefaultPenaltyHalfLife = time.Hour 20 21 // minSecondChanceInterval is the minimum time required between 22 // second-chance failures. 23 // 24 // If nodes return a channel policy related failure, they may get a 25 // second chance to forward the payment. It could be that the channel 26 // policy that we are aware of is not up to date. This is especially 27 // important in case of mobile apps that are mostly offline. 28 // 29 // However, we don't want to give nodes the option to endlessly return 30 // new channel updates so that we are kept busy trying to route through 31 // that node until the payment loop times out. 32 // 33 // Therefore we only grant a second chance to a node if the previous 34 // second chance is sufficiently long ago. This is what 35 // minSecondChanceInterval defines. If a second policy failure comes in 36 // within that interval, we will apply a penalty. 37 // 38 // Second chances granted are tracked on the level of node pairs. This 39 // means that if a node has multiple channels to the same peer, they 40 // will only get a single second chance to route to that peer again. 41 // Nodes forward non-strict, so it isn't necessary to apply a less 42 // restrictive channel level tracking scheme here. 43 minSecondChanceInterval = time.Minute 44 45 // DefaultMaxMcHistory is the default maximum history size. 46 DefaultMaxMcHistory = 1000 47 48 // DefaultMcFlushInterval is the defaul inteval we use to flush MC state 49 // to the database. 50 DefaultMcFlushInterval = time.Second 51 52 // prevSuccessProbability is the assumed probability for node pairs that 53 // successfully relayed the previous attempt. 54 prevSuccessProbability = 0.95 55 56 // DefaultAprioriWeight is the default a priori weight. See 57 // MissionControlConfig for further explanation. 58 DefaultAprioriWeight = 0.5 59 60 // DefaultMinFailureRelaxInterval is the default minimum time that must 61 // have passed since the previously recorded failure before the failure 62 // amount may be raised. 63 DefaultMinFailureRelaxInterval = time.Minute 64 ) 65 66 var ( 67 // ErrInvalidMcHistory is returned if we get a negative mission control 68 // history count. 69 ErrInvalidMcHistory = errors.New("mission control history must be " + 70 ">= 0") 71 72 // ErrInvalidFailureInterval is returned if we get an invalid failure 73 // interval. 74 ErrInvalidFailureInterval = errors.New("failure interval must be >= 0") 75 ) 76 77 // NodeResults contains previous results from a node to its peers. 78 type NodeResults map[route.Vertex]TimedPairResult 79 80 // MissionControl contains state which summarizes the past attempts of HTLC 81 // routing by external callers when sending payments throughout the network. It 82 // acts as a shared memory during routing attempts with the goal to optimize the 83 // payment attempt success rate. 84 // 85 // Failed payment attempts are reported to mission control. These reports are 86 // used to track the time of the last node or channel level failure. The time 87 // since the last failure is used to estimate a success probability that is fed 88 // into the path finding process for subsequent payment attempts. 89 type MissionControl struct { 90 // state is the internal mission control state that is input for 91 // probability estimation. 92 state *missionControlState 93 94 // now is expected to return the current time. It is supplied as an 95 // external function to enable deterministic unit tests. 96 now func() time.Time 97 98 // selfNode is our pubkey. 99 selfNode route.Vertex 100 101 store *missionControlStore 102 103 // estimator is the probability estimator that is used with the payment 104 // results that mission control collects. 105 estimator *probabilityEstimator 106 107 sync.Mutex 108 109 // TODO(roasbeef): further counters, if vertex continually unavailable, 110 // add to another generation 111 112 // TODO(roasbeef): also add favorable metrics for nodes 113 } 114 115 // MissionControlConfig defines parameters that control mission control 116 // behaviour. 117 type MissionControlConfig struct { 118 // ProbabilityEstimatorConfig is the config we will use for probability 119 // calculations. 120 ProbabilityEstimatorCfg 121 122 // MaxMcHistory defines the maximum number of payment results that are 123 // held on disk. 124 MaxMcHistory int 125 126 // McFlushInterval defines the ticker interval when we flush the 127 // accumulated state to the DB. 128 McFlushInterval time.Duration 129 130 // MinFailureRelaxInterval is the minimum time that must have passed 131 // since the previously recorded failure before the failure amount may 132 // be raised. 133 MinFailureRelaxInterval time.Duration 134 } 135 136 func (c *MissionControlConfig) validate() error { 137 if err := c.ProbabilityEstimatorCfg.validate(); err != nil { 138 return err 139 } 140 141 if c.MaxMcHistory < 0 { 142 return ErrInvalidMcHistory 143 } 144 145 if c.MinFailureRelaxInterval < 0 { 146 return ErrInvalidFailureInterval 147 } 148 149 return nil 150 } 151 152 // String returns a string representation of a mission control config. 153 func (c *MissionControlConfig) String() string { 154 return fmt.Sprintf("Penalty Half Life: %v, Apriori Hop "+ 155 "Probablity: %v, Maximum History: %v, Apriori Weight: %v, "+ 156 "Minimum Failure Relax Interval: %v", c.PenaltyHalfLife, 157 c.AprioriHopProbability, c.MaxMcHistory, c.AprioriWeight, 158 c.MinFailureRelaxInterval) 159 } 160 161 // TimedPairResult describes a timestamped pair result. 162 type TimedPairResult struct { 163 // FailTime is the time of the last failure. 164 FailTime time.Time 165 166 // FailAmt is the amount of the last failure. This amount may be pushed 167 // up if a later success is higher than the last failed amount. 168 FailAmt lnwire.MilliAtom 169 170 // SuccessTime is the time of the last success. 171 SuccessTime time.Time 172 173 // SuccessAmt is the highest amount that successfully forwarded. This 174 // isn't necessarily the last success amount. The value of this field 175 // may also be pushed down if a later failure is lower than the highest 176 // success amount. Because of this, SuccessAmt may not match 177 // SuccessTime. 178 SuccessAmt lnwire.MilliAtom 179 } 180 181 // MissionControlSnapshot contains a snapshot of the current state of mission 182 // control. 183 type MissionControlSnapshot struct { 184 // Pairs is a list of channels for which specific information is 185 // logged. 186 Pairs []MissionControlPairSnapshot 187 } 188 189 // MissionControlPairSnapshot contains a snapshot of the current node pair 190 // state in mission control. 191 type MissionControlPairSnapshot struct { 192 // Pair is the node pair of which the state is described. 193 Pair DirectedNodePair 194 195 // TimedPairResult contains the data for this pair. 196 TimedPairResult 197 } 198 199 // paymentResult is the information that becomes available when a payment 200 // attempt completes. 201 type paymentResult struct { 202 id uint64 203 timeFwd, timeReply time.Time 204 route *route.Route 205 success bool 206 failureSourceIdx *int 207 failure lnwire.FailureMessage 208 } 209 210 // NewMissionControl returns a new instance of missionControl. 211 func NewMissionControl(db kvdb.Backend, self route.Vertex, 212 cfg *MissionControlConfig) (*MissionControl, error) { 213 214 log.Debugf("Instantiating mission control with config: %v", cfg) 215 216 if err := cfg.validate(); err != nil { 217 return nil, err 218 } 219 220 store, err := newMissionControlStore( 221 db, cfg.MaxMcHistory, cfg.McFlushInterval, 222 ) 223 if err != nil { 224 return nil, err 225 } 226 227 estimator := &probabilityEstimator{ 228 ProbabilityEstimatorCfg: cfg.ProbabilityEstimatorCfg, 229 prevSuccessProbability: prevSuccessProbability, 230 } 231 232 mc := &MissionControl{ 233 state: newMissionControlState(cfg.MinFailureRelaxInterval), 234 now: time.Now, 235 selfNode: self, 236 store: store, 237 estimator: estimator, 238 } 239 240 if err := mc.init(); err != nil { 241 return nil, err 242 } 243 244 return mc, nil 245 } 246 247 // RunStoreTicker runs the mission control store's ticker. 248 func (m *MissionControl) RunStoreTicker() { 249 m.store.run() 250 } 251 252 // StopStoreTicker stops the mission control store's ticker. 253 func (m *MissionControl) StopStoreTicker() { 254 m.store.stop() 255 } 256 257 // init initializes mission control with historical data. 258 func (m *MissionControl) init() error { 259 log.Debugf("Mission control state reconstruction started") 260 261 start := time.Now() 262 263 results, err := m.store.fetchAll() 264 if err != nil { 265 return err 266 } 267 268 for _, result := range results { 269 m.applyPaymentResult(result) 270 } 271 272 log.Debugf("Mission control state reconstruction finished: "+ 273 "n=%v, time=%v", len(results), time.Since(start)) 274 275 return nil 276 } 277 278 // GetConfig returns the config that mission control is currently configured 279 // with. All fields are copied by value, so we do not need to worry about 280 // mutation. 281 func (m *MissionControl) GetConfig() *MissionControlConfig { 282 m.Lock() 283 defer m.Unlock() 284 285 return &MissionControlConfig{ 286 ProbabilityEstimatorCfg: m.estimator.ProbabilityEstimatorCfg, 287 MaxMcHistory: m.store.maxRecords, 288 McFlushInterval: m.store.flushInterval, 289 MinFailureRelaxInterval: m.state.minFailureRelaxInterval, 290 } 291 } 292 293 // SetConfig validates the config provided and updates mission control's config 294 // if it is valid. 295 func (m *MissionControl) SetConfig(cfg *MissionControlConfig) error { 296 if cfg == nil { 297 return errors.New("nil mission control config") 298 } 299 300 if err := cfg.validate(); err != nil { 301 return err 302 } 303 304 m.Lock() 305 defer m.Unlock() 306 307 log.Infof("Updating mission control cfg: %v", cfg) 308 309 m.store.maxRecords = cfg.MaxMcHistory 310 m.state.minFailureRelaxInterval = cfg.MinFailureRelaxInterval 311 m.estimator.ProbabilityEstimatorCfg = cfg.ProbabilityEstimatorCfg 312 313 return nil 314 } 315 316 // ResetHistory resets the history of MissionControl returning it to a state as 317 // if no payment attempts have been made. 318 func (m *MissionControl) ResetHistory() error { 319 m.Lock() 320 defer m.Unlock() 321 322 if err := m.store.clear(); err != nil { 323 return err 324 } 325 326 m.state.resetHistory() 327 328 log.Debugf("Mission control history cleared") 329 330 return nil 331 } 332 333 // GetProbability is expected to return the success probability of a payment 334 // from fromNode along edge. 335 func (m *MissionControl) GetProbability(fromNode, toNode route.Vertex, 336 amt lnwire.MilliAtom) float64 { 337 338 m.Lock() 339 defer m.Unlock() 340 341 now := m.now() 342 results, _ := m.state.getLastPairResult(fromNode) 343 344 // Use a distinct probability estimation function for local channels. 345 if fromNode == m.selfNode { 346 return m.estimator.getLocalPairProbability(now, results, toNode) 347 } 348 349 return m.estimator.getPairProbability(now, results, toNode, amt) 350 } 351 352 // GetHistorySnapshot takes a snapshot from the current mission control state 353 // and actual probability estimates. 354 func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot { 355 m.Lock() 356 defer m.Unlock() 357 358 log.Debugf("Requesting history snapshot from mission control") 359 360 return m.state.getSnapshot() 361 } 362 363 // ImportHistory imports the set of mission control results provided to our 364 // in-memory state. These results are not persisted, so will not survive 365 // restarts. 366 func (m *MissionControl) ImportHistory(history *MissionControlSnapshot, 367 force bool) error { 368 369 if history == nil { 370 return errors.New("cannot import nil history") 371 } 372 373 m.Lock() 374 defer m.Unlock() 375 376 log.Infof("Importing history snapshot with %v pairs to mission control", 377 len(history.Pairs)) 378 379 imported := m.state.importSnapshot(history, force) 380 381 log.Infof("Imported %v results to mission control", imported) 382 383 return nil 384 } 385 386 // GetPairHistorySnapshot returns the stored history for a given node pair. 387 func (m *MissionControl) GetPairHistorySnapshot( 388 fromNode, toNode route.Vertex) TimedPairResult { 389 390 m.Lock() 391 defer m.Unlock() 392 393 results, ok := m.state.getLastPairResult(fromNode) 394 if !ok { 395 return TimedPairResult{} 396 } 397 398 result, ok := results[toNode] 399 if !ok { 400 return TimedPairResult{} 401 } 402 403 return result 404 } 405 406 // ReportPaymentFail reports a failed payment to mission control as input for 407 // future probability estimates. The failureSourceIdx argument indicates the 408 // failure source. If it is nil, the failure source is unknown. This function 409 // returns a reason if this failure is a final failure. In that case no further 410 // payment attempts need to be made. 411 func (m *MissionControl) ReportPaymentFail(paymentID uint64, rt *route.Route, 412 failureSourceIdx *int, failure lnwire.FailureMessage) ( 413 *channeldb.FailureReason, error) { 414 415 timestamp := m.now() 416 417 result := &paymentResult{ 418 success: false, 419 timeFwd: timestamp, 420 timeReply: timestamp, 421 id: paymentID, 422 failureSourceIdx: failureSourceIdx, 423 failure: failure, 424 route: rt, 425 } 426 427 return m.processPaymentResult(result) 428 } 429 430 // ReportPaymentSuccess reports a successful payment to mission control as input 431 // for future probability estimates. 432 func (m *MissionControl) ReportPaymentSuccess(paymentID uint64, 433 rt *route.Route) error { 434 435 timestamp := m.now() 436 437 result := &paymentResult{ 438 timeFwd: timestamp, 439 timeReply: timestamp, 440 id: paymentID, 441 success: true, 442 route: rt, 443 } 444 445 _, err := m.processPaymentResult(result) 446 return err 447 } 448 449 // processPaymentResult stores a payment result in the mission control store and 450 // updates mission control's in-memory state. 451 func (m *MissionControl) processPaymentResult(result *paymentResult) ( 452 *channeldb.FailureReason, error) { 453 454 // Store complete result in database. 455 m.store.AddResult(result) 456 457 m.Lock() 458 defer m.Unlock() 459 460 // Apply result to update mission control state. 461 reason := m.applyPaymentResult(result) 462 463 return reason, nil 464 } 465 466 // applyPaymentResult applies a payment result as input for future probability 467 // estimates. It returns a bool indicating whether this error is a final error 468 // and no further payment attempts need to be made. 469 func (m *MissionControl) applyPaymentResult( 470 result *paymentResult) *channeldb.FailureReason { 471 472 // Interpret result. 473 i := interpretResult( 474 result.route, result.success, result.failureSourceIdx, 475 result.failure, 476 ) 477 478 if i.policyFailure != nil { 479 if m.state.requestSecondChance( 480 result.timeReply, 481 i.policyFailure.From, i.policyFailure.To, 482 ) { 483 return nil 484 } 485 } 486 487 // If there is a node-level failure, record a failure for every tried 488 // connection of that node. A node-level failure can be considered as a 489 // failure that would have occurred with any of the node's channels. 490 // 491 // Ideally we'd also record the failure for the untried connections of 492 // the node. Unfortunately this would require access to the graph and 493 // adding this dependency and db calls does not outweigh the benefits. 494 // 495 // Untried connections will fall back to the node probability. After the 496 // call to setAllPairResult below, the node probability will be equal to 497 // the probability of the tried channels except that the a priori 498 // probability is mixed in too. This effect is controlled by the 499 // aprioriWeight parameter. If that parameter isn't set to an extreme 500 // and there are a few known connections, there shouldn't be much of a 501 // difference. The largest difference occurs when aprioriWeight is 1. In 502 // that case, a node-level failure would not be applied to untried 503 // channels. 504 if i.nodeFailure != nil { 505 log.Debugf("Reporting node failure to Mission Control: "+ 506 "node=%v", *i.nodeFailure) 507 508 m.state.setAllFail(*i.nodeFailure, result.timeReply) 509 } 510 511 for pair, pairResult := range i.pairResults { 512 pairResult := pairResult 513 514 if pairResult.success { 515 log.Debugf("Reporting pair success to Mission "+ 516 "Control: pair=%v, amt=%v", 517 pair, pairResult.amt) 518 } else { 519 log.Debugf("Reporting pair failure to Mission "+ 520 "Control: pair=%v, amt=%v", 521 pair, pairResult.amt) 522 } 523 524 m.state.setLastPairResult( 525 pair.From, pair.To, result.timeReply, &pairResult, false, 526 ) 527 } 528 529 return i.finalFailureReason 530 }