github.com/decred/dcrlnd@v0.7.6/routing/missioncontrol.go

github.com/decred/dcrlnd@v0.7.6/routing/missioncontrol.go (about)

     1  package routing
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/decred/dcrlnd/channeldb"
    10  	"github.com/decred/dcrlnd/kvdb"
    11  	"github.com/decred/dcrlnd/lnwire"
    12  	"github.com/decred/dcrlnd/routing/route"
    13  )
    14  
const (
	// DefaultPenaltyHalfLife is the default half-life duration. The
	// half-life duration defines after how much time a penalized node or
	// channel is back at 50% probability.
	DefaultPenaltyHalfLife = time.Hour

	// minSecondChanceInterval is the minimum time required between
	// second-chance failures.
	//
	// If nodes return a channel policy related failure, they may get a
	// second chance to forward the payment. It could be that the channel
	// policy that we are aware of is not up to date. This is especially
	// important in case of mobile apps that are mostly offline.
	//
	// However, we don't want to give nodes the option to endlessly return
	// new channel updates so that we are kept busy trying to route through
	// that node until the payment loop times out.
	//
	// Therefore we only grant a second chance to a node if the previous
	// second chance is sufficiently long ago. This is what
	// minSecondChanceInterval defines. If a second policy failure comes in
	// within that interval, we will apply a penalty.
	//
	// Second chances granted are tracked on the level of node pairs. This
	// means that if a node has multiple channels to the same peer, they
	// will only get a single second chance to route to that peer again.
	// Nodes forward non-strict, so it isn't necessary to apply a less
	// restrictive channel level tracking scheme here.
	minSecondChanceInterval = time.Minute

	// DefaultMaxMcHistory is the default maximum history size, i.e. the
	// default value for MissionControlConfig.MaxMcHistory.
	DefaultMaxMcHistory = 1000

	// DefaultMcFlushInterval is the default interval we use to flush MC
	// state to the database.
	DefaultMcFlushInterval = time.Second

	// prevSuccessProbability is the assumed probability for node pairs that
	// successfully relayed the previous attempt.
	prevSuccessProbability = 0.95

	// DefaultAprioriWeight is the default a priori weight. See
	// MissionControlConfig for further explanation.
	DefaultAprioriWeight = 0.5

	// DefaultMinFailureRelaxInterval is the default minimum time that must
	// have passed since the previously recorded failure before the failure
	// amount may be raised.
	DefaultMinFailureRelaxInterval = time.Minute
)
    65  
var (
	// ErrInvalidMcHistory is returned by config validation if we get a
	// negative mission control history count (MaxMcHistory < 0).
	ErrInvalidMcHistory = errors.New("mission control history must be " +
		">= 0")

	// ErrInvalidFailureInterval is returned by config validation if we
	// get a negative minimum failure relax interval
	// (MinFailureRelaxInterval < 0).
	ErrInvalidFailureInterval = errors.New("failure interval must be >= 0")
)
    76  
// NodeResults contains previous results from a node to its peers. Each entry
// maps a peer's vertex to the timestamped result recorded for the connection
// to that peer.
type NodeResults map[route.Vertex]TimedPairResult
    79  
// MissionControl contains state which summarizes the past attempts of HTLC
// routing by external callers when sending payments throughout the network. It
// acts as a shared memory during routing attempts with the goal to optimize the
// payment attempt success rate.
//
// Failed payment attempts are reported to mission control. These reports are
// used to track the time of the last node or channel level failure. The time
// since the last failure is used to estimate a success probability that is fed
// into the path finding process for subsequent payment attempts.
type MissionControl struct {
	// state is the internal mission control state that is input for
	// probability estimation.
	state *missionControlState

	// now is expected to return the current time. It is supplied as an
	// external function to enable deterministic unit tests.
	now func() time.Time

	// selfNode is our pubkey.
	selfNode route.Vertex

	// store is the persistent store that payment results are added to and
	// from which historical results are fetched on initialization.
	store *missionControlStore

	// estimator is the probability estimator that is used with the payment
	// results that mission control collects.
	estimator *probabilityEstimator

	// Mutex is taken by the exported methods while they read or update
	// state, the estimator config and the store settings.
	sync.Mutex

	// TODO(roasbeef): further counters, if vertex continually unavailable,
	// add to another generation

	// TODO(roasbeef): also add favorable metrics for nodes
}
   114  
// MissionControlConfig defines parameters that control mission control
// behaviour.
type MissionControlConfig struct {
	// ProbabilityEstimatorCfg is the config we will use for probability
	// calculations. It is embedded, so its fields are promoted onto
	// MissionControlConfig.
	ProbabilityEstimatorCfg

	// MaxMcHistory defines the maximum number of payment results that are
	// held on disk. Validation rejects negative values.
	MaxMcHistory int

	// McFlushInterval defines the ticker interval when we flush the
	// accumulated state to the DB.
	McFlushInterval time.Duration

	// MinFailureRelaxInterval is the minimum time that must have passed
	// since the previously recorded failure before the failure amount may
	// be raised. Validation rejects negative values.
	MinFailureRelaxInterval time.Duration
}
   135  
   136  func (c *MissionControlConfig) validate() error {
   137  	if err := c.ProbabilityEstimatorCfg.validate(); err != nil {
   138  		return err
   139  	}
   140  
   141  	if c.MaxMcHistory < 0 {
   142  		return ErrInvalidMcHistory
   143  	}
   144  
   145  	if c.MinFailureRelaxInterval < 0 {
   146  		return ErrInvalidFailureInterval
   147  	}
   148  
   149  	return nil
   150  }
   151  
   152  // String returns a string representation of a mission control config.
   153  func (c *MissionControlConfig) String() string {
   154  	return fmt.Sprintf("Penalty Half Life: %v, Apriori Hop "+
   155  		"Probablity: %v, Maximum History: %v, Apriori Weight: %v, "+
   156  		"Minimum Failure Relax Interval: %v", c.PenaltyHalfLife,
   157  		c.AprioriHopProbability, c.MaxMcHistory, c.AprioriWeight,
   158  		c.MinFailureRelaxInterval)
   159  }
   160  
// TimedPairResult describes a timestamped pair result. It tracks the last
// failure and the last success separately, each with its own time and amount.
type TimedPairResult struct {
	// FailTime is the time of the last failure.
	FailTime time.Time

	// FailAmt is the amount of the last failure. This amount may be pushed
	// up if a later success is higher than the last failed amount.
	FailAmt lnwire.MilliAtom

	// SuccessTime is the time of the last success.
	SuccessTime time.Time

	// SuccessAmt is the highest amount that successfully forwarded. This
	// isn't necessarily the last success amount. The value of this field
	// may also be pushed down if a later failure is lower than the highest
	// success amount. Because of this, SuccessAmt may not match
	// SuccessTime.
	SuccessAmt lnwire.MilliAtom
}
   180  
// MissionControlSnapshot contains a snapshot of the current state of mission
// control. It is the type returned by GetHistorySnapshot and accepted by
// ImportHistory.
type MissionControlSnapshot struct {
	// Pairs is a list of node pairs for which specific information is
	// logged.
	Pairs []MissionControlPairSnapshot
}
   188  
// MissionControlPairSnapshot contains a snapshot of the current node pair
// state in mission control.
type MissionControlPairSnapshot struct {
	// Pair is the node pair of which the state is described.
	Pair DirectedNodePair

	// TimedPairResult contains the data for this pair. It is embedded, so
	// its fail/success fields are promoted onto the snapshot.
	TimedPairResult
}
   198  
// paymentResult is the information that becomes available when a payment
// attempt completes.
//
// Fields:
//   - id: the payment ID passed in by the reporter.
//   - timeFwd, timeReply: timestamps of the attempt; the report methods in
//     this file set both to the same current time, and timeReply is the one
//     used when the result is applied to the state.
//   - route: the route that was attempted.
//   - success: whether the attempt succeeded.
//   - failureSourceIdx: index of the failure source for a failed attempt;
//     nil if the failure source is unknown.
//   - failure: the failure message for a failed attempt.
type paymentResult struct {
	id                 uint64
	timeFwd, timeReply time.Time
	route              *route.Route
	success            bool
	failureSourceIdx   *int
	failure            lnwire.FailureMessage
}
   209  
   210  // NewMissionControl returns a new instance of missionControl.
   211  func NewMissionControl(db kvdb.Backend, self route.Vertex,
   212  	cfg *MissionControlConfig) (*MissionControl, error) {
   213  
   214  	log.Debugf("Instantiating mission control with config: %v", cfg)
   215  
   216  	if err := cfg.validate(); err != nil {
   217  		return nil, err
   218  	}
   219  
   220  	store, err := newMissionControlStore(
   221  		db, cfg.MaxMcHistory, cfg.McFlushInterval,
   222  	)
   223  	if err != nil {
   224  		return nil, err
   225  	}
   226  
   227  	estimator := &probabilityEstimator{
   228  		ProbabilityEstimatorCfg: cfg.ProbabilityEstimatorCfg,
   229  		prevSuccessProbability:  prevSuccessProbability,
   230  	}
   231  
   232  	mc := &MissionControl{
   233  		state:     newMissionControlState(cfg.MinFailureRelaxInterval),
   234  		now:       time.Now,
   235  		selfNode:  self,
   236  		store:     store,
   237  		estimator: estimator,
   238  	}
   239  
   240  	if err := mc.init(); err != nil {
   241  		return nil, err
   242  	}
   243  
   244  	return mc, nil
   245  }
   246  
// RunStoreTicker runs the mission control store's ticker by delegating to the
// store's run method.
//
// NOTE(review): presumably this starts the periodic flush governed by
// McFlushInterval — confirm against the store implementation.
func (m *MissionControl) RunStoreTicker() {
	m.store.run()
}
   251  
// StopStoreTicker stops the mission control store's ticker by delegating to
// the store's stop method.
func (m *MissionControl) StopStoreTicker() {
	m.store.stop()
}
   256  
   257  // init initializes mission control with historical data.
   258  func (m *MissionControl) init() error {
   259  	log.Debugf("Mission control state reconstruction started")
   260  
   261  	start := time.Now()
   262  
   263  	results, err := m.store.fetchAll()
   264  	if err != nil {
   265  		return err
   266  	}
   267  
   268  	for _, result := range results {
   269  		m.applyPaymentResult(result)
   270  	}
   271  
   272  	log.Debugf("Mission control state reconstruction finished: "+
   273  		"n=%v, time=%v", len(results), time.Since(start))
   274  
   275  	return nil
   276  }
   277  
   278  // GetConfig returns the config that mission control is currently configured
   279  // with. All fields are copied by value, so we do not need to worry about
   280  // mutation.
   281  func (m *MissionControl) GetConfig() *MissionControlConfig {
   282  	m.Lock()
   283  	defer m.Unlock()
   284  
   285  	return &MissionControlConfig{
   286  		ProbabilityEstimatorCfg: m.estimator.ProbabilityEstimatorCfg,
   287  		MaxMcHistory:            m.store.maxRecords,
   288  		McFlushInterval:         m.store.flushInterval,
   289  		MinFailureRelaxInterval: m.state.minFailureRelaxInterval,
   290  	}
   291  }
   292  
   293  // SetConfig validates the config provided and updates mission control's config
   294  // if it is valid.
   295  func (m *MissionControl) SetConfig(cfg *MissionControlConfig) error {
   296  	if cfg == nil {
   297  		return errors.New("nil mission control config")
   298  	}
   299  
   300  	if err := cfg.validate(); err != nil {
   301  		return err
   302  	}
   303  
   304  	m.Lock()
   305  	defer m.Unlock()
   306  
   307  	log.Infof("Updating mission control cfg: %v", cfg)
   308  
   309  	m.store.maxRecords = cfg.MaxMcHistory
   310  	m.state.minFailureRelaxInterval = cfg.MinFailureRelaxInterval
   311  	m.estimator.ProbabilityEstimatorCfg = cfg.ProbabilityEstimatorCfg
   312  
   313  	return nil
   314  }
   315  
   316  // ResetHistory resets the history of MissionControl returning it to a state as
   317  // if no payment attempts have been made.
   318  func (m *MissionControl) ResetHistory() error {
   319  	m.Lock()
   320  	defer m.Unlock()
   321  
   322  	if err := m.store.clear(); err != nil {
   323  		return err
   324  	}
   325  
   326  	m.state.resetHistory()
   327  
   328  	log.Debugf("Mission control history cleared")
   329  
   330  	return nil
   331  }
   332  
   333  // GetProbability is expected to return the success probability of a payment
   334  // from fromNode along edge.
   335  func (m *MissionControl) GetProbability(fromNode, toNode route.Vertex,
   336  	amt lnwire.MilliAtom) float64 {
   337  
   338  	m.Lock()
   339  	defer m.Unlock()
   340  
   341  	now := m.now()
   342  	results, _ := m.state.getLastPairResult(fromNode)
   343  
   344  	// Use a distinct probability estimation function for local channels.
   345  	if fromNode == m.selfNode {
   346  		return m.estimator.getLocalPairProbability(now, results, toNode)
   347  	}
   348  
   349  	return m.estimator.getPairProbability(now, results, toNode, amt)
   350  }
   351  
   352  // GetHistorySnapshot takes a snapshot from the current mission control state
   353  // and actual probability estimates.
   354  func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
   355  	m.Lock()
   356  	defer m.Unlock()
   357  
   358  	log.Debugf("Requesting history snapshot from mission control")
   359  
   360  	return m.state.getSnapshot()
   361  }
   362  
   363  // ImportHistory imports the set of mission control results provided to our
   364  // in-memory state. These results are not persisted, so will not survive
   365  // restarts.
   366  func (m *MissionControl) ImportHistory(history *MissionControlSnapshot,
   367  	force bool) error {
   368  
   369  	if history == nil {
   370  		return errors.New("cannot import nil history")
   371  	}
   372  
   373  	m.Lock()
   374  	defer m.Unlock()
   375  
   376  	log.Infof("Importing history snapshot with %v pairs to mission control",
   377  		len(history.Pairs))
   378  
   379  	imported := m.state.importSnapshot(history, force)
   380  
   381  	log.Infof("Imported %v results to mission control", imported)
   382  
   383  	return nil
   384  }
   385  
   386  // GetPairHistorySnapshot returns the stored history for a given node pair.
   387  func (m *MissionControl) GetPairHistorySnapshot(
   388  	fromNode, toNode route.Vertex) TimedPairResult {
   389  
   390  	m.Lock()
   391  	defer m.Unlock()
   392  
   393  	results, ok := m.state.getLastPairResult(fromNode)
   394  	if !ok {
   395  		return TimedPairResult{}
   396  	}
   397  
   398  	result, ok := results[toNode]
   399  	if !ok {
   400  		return TimedPairResult{}
   401  	}
   402  
   403  	return result
   404  }
   405  
   406  // ReportPaymentFail reports a failed payment to mission control as input for
   407  // future probability estimates. The failureSourceIdx argument indicates the
   408  // failure source. If it is nil, the failure source is unknown. This function
   409  // returns a reason if this failure is a final failure. In that case no further
   410  // payment attempts need to be made.
   411  func (m *MissionControl) ReportPaymentFail(paymentID uint64, rt *route.Route,
   412  	failureSourceIdx *int, failure lnwire.FailureMessage) (
   413  	*channeldb.FailureReason, error) {
   414  
   415  	timestamp := m.now()
   416  
   417  	result := &paymentResult{
   418  		success:          false,
   419  		timeFwd:          timestamp,
   420  		timeReply:        timestamp,
   421  		id:               paymentID,
   422  		failureSourceIdx: failureSourceIdx,
   423  		failure:          failure,
   424  		route:            rt,
   425  	}
   426  
   427  	return m.processPaymentResult(result)
   428  }
   429  
   430  // ReportPaymentSuccess reports a successful payment to mission control as input
   431  // for future probability estimates.
   432  func (m *MissionControl) ReportPaymentSuccess(paymentID uint64,
   433  	rt *route.Route) error {
   434  
   435  	timestamp := m.now()
   436  
   437  	result := &paymentResult{
   438  		timeFwd:   timestamp,
   439  		timeReply: timestamp,
   440  		id:        paymentID,
   441  		success:   true,
   442  		route:     rt,
   443  	}
   444  
   445  	_, err := m.processPaymentResult(result)
   446  	return err
   447  }
   448  
   449  // processPaymentResult stores a payment result in the mission control store and
   450  // updates mission control's in-memory state.
   451  func (m *MissionControl) processPaymentResult(result *paymentResult) (
   452  	*channeldb.FailureReason, error) {
   453  
   454  	// Store complete result in database.
   455  	m.store.AddResult(result)
   456  
   457  	m.Lock()
   458  	defer m.Unlock()
   459  
   460  	// Apply result to update mission control state.
   461  	reason := m.applyPaymentResult(result)
   462  
   463  	return reason, nil
   464  }
   465  
   466  // applyPaymentResult applies a payment result as input for future probability
   467  // estimates. It returns a bool indicating whether this error is a final error
   468  // and no further payment attempts need to be made.
   469  func (m *MissionControl) applyPaymentResult(
   470  	result *paymentResult) *channeldb.FailureReason {
   471  
   472  	// Interpret result.
   473  	i := interpretResult(
   474  		result.route, result.success, result.failureSourceIdx,
   475  		result.failure,
   476  	)
   477  
   478  	if i.policyFailure != nil {
   479  		if m.state.requestSecondChance(
   480  			result.timeReply,
   481  			i.policyFailure.From, i.policyFailure.To,
   482  		) {
   483  			return nil
   484  		}
   485  	}
   486  
   487  	// If there is a node-level failure, record a failure for every tried
   488  	// connection of that node. A node-level failure can be considered as a
   489  	// failure that would have occurred with any of the node's channels.
   490  	//
   491  	// Ideally we'd also record the failure for the untried connections of
   492  	// the node. Unfortunately this would require access to the graph and
   493  	// adding this dependency and db calls does not outweigh the benefits.
   494  	//
   495  	// Untried connections will fall back to the node probability. After the
   496  	// call to setAllPairResult below, the node probability will be equal to
   497  	// the probability of the tried channels except that the a priori
   498  	// probability is mixed in too. This effect is controlled by the
   499  	// aprioriWeight parameter. If that parameter isn't set to an extreme
   500  	// and there are a few known connections, there shouldn't be much of a
   501  	// difference. The largest difference occurs when aprioriWeight is 1. In
   502  	// that case, a node-level failure would not be applied to untried
   503  	// channels.
   504  	if i.nodeFailure != nil {
   505  		log.Debugf("Reporting node failure to Mission Control: "+
   506  			"node=%v", *i.nodeFailure)
   507  
   508  		m.state.setAllFail(*i.nodeFailure, result.timeReply)
   509  	}
   510  
   511  	for pair, pairResult := range i.pairResults {
   512  		pairResult := pairResult
   513  
   514  		if pairResult.success {
   515  			log.Debugf("Reporting pair success to Mission "+
   516  				"Control: pair=%v, amt=%v",
   517  				pair, pairResult.amt)
   518  		} else {
   519  			log.Debugf("Reporting pair failure to Mission "+
   520  				"Control: pair=%v, amt=%v",
   521  				pair, pairResult.amt)
   522  		}
   523  
   524  		m.state.setLastPairResult(
   525  			pair.From, pair.To, result.timeReply, &pairResult, false,
   526  		)
   527  	}
   528  
   529  	return i.finalFailureReason
   530  }