code.vegaprotocol.io/vega@v0.79.0/core/validators/witness.go (about)

     1  // Copyright (C) 2023 Gobalsky Labs Limited
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  package validators
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"math/rand"
    23  	"sort"
    24  	"strconv"
    25  	"sync"
    26  	"sync/atomic"
    27  	"time"
    28  
    29  	"code.vegaprotocol.io/vega/core/txn"
    30  	"code.vegaprotocol.io/vega/libs/crypto"
    31  	"code.vegaprotocol.io/vega/libs/num"
    32  	"code.vegaprotocol.io/vega/logging"
    33  	commandspb "code.vegaprotocol.io/vega/protos/vega/commands/v1"
    34  
    35  	"github.com/cenkalti/backoff"
    36  	"github.com/golang/protobuf/proto"
    37  )
    38  
// Sentinel errors returned by the Witness.
//
//   - ErrResourceDuplicate: startCheck was called with a resource ID that is
//     already being tracked.
//   - ErrCheckUntilInvalid: the checkUntil deadline is outside the window
//     accepted by validateCheckUntil.
//   - ErrInvalidResourceIDForNodeVote: AddNodeCheck received a vote whose
//     reference does not match any tracked resource.
//   - ErrVoteFromNonValidator: AddNodeCheck received a vote from a key that
//     the topology does not recognise as a validator.
//   - ErrDuplicateVoteFromNode: the same key voted twice for one resource.
var (
	ErrResourceDuplicate            = errors.New("resource duplicate")
	ErrCheckUntilInvalid            = errors.New("invalid time to check until")
	ErrInvalidResourceIDForNodeVote = errors.New("invalid resource ID")
	ErrVoteFromNonValidator         = errors.New("vote from non validator")
	ErrDuplicateVoteFromNode        = errors.New("duplicate vote from node")
)
    46  
// TimeService is the time source handed to NewWitness, which calls
// GetTimeNow once to initialise the Witness clock.
type TimeService interface {
	GetTimeNow() time.Time
}
    50  
// Commander is used by the Witness to submit commands. OnTick uses Command to
// send this node's NodeVote; the callback f receives the sending error, and a
// non-nil error makes the Witness schedule a resend (see onCommandSent).
// The Witness passes a nil bo.
type Commander interface {
	Command(ctx context.Context, cmd txn.Command, payload proto.Message, f func(string, error), bo *backoff.ExponentialBackOff)
	CommandSync(ctx context.Context, cmd txn.Command, payload proto.Message, f func(string, error), bo *backoff.ExponentialBackOff)
}
    55  
// ValidatorTopology is the view of the validator set the Witness relies on:
// whether this node is a validator, which keys are allowed to vote
// (IsValidatorVegaPubKey), and how much voting power each vote carries
// (IsTendermintValidator, GetVotingPower, GetTotalVotingPower) when deciding
// whether a resource has gathered enough votes.
type ValidatorTopology interface {
	IsValidator() bool
	SelfVegaPubKey() string
	AllVegaPubKeys() []string
	IsValidatorVegaPubKey(string) bool
	IsTendermintValidator(string) bool
	GetVotingPower(pubkey string) int64
	GetTotalVotingPower() int64
}
    65  
// Resource is something the Witness can check and vote on.
//
// GetID is the key under which the resource is tracked and the reference put
// in the NodeVote; GetType fills the NodeVote type; Check is retried with a
// backoff until it returns nil or the check context ends; GetChainID selects
// the default confirmations and approximate block time used for the initial
// delay.
type Resource interface {
	GetID() string
	GetType() commandspb.NodeVote_Type
	Check(ctx context.Context) error
	GetChainID() string
}
    72  
// States stored in res.state.
//
//   - notValidated: initial state set by startCheck.
//   - validated: set by the checking goroutine once Resource.Check succeeded;
//     OnTick reacts to it by sending the node vote.
//   - voteSent: set by OnTick after issuing the vote, or directly by
//     startCheck when this node is not a validator.
const (
	notValidated uint32 = iota
	validated
	voteSent
)
    78  
// Bounds, relative to the Witness clock, on the checkUntil deadline accepted
// by validateCheckUntil.
//
// NOTE(review): minValidationPeriod is an untyped constant, so when it is
// passed to time.Time.Add it is a time.Duration of 1 nanosecond, not one
// second or one minute. validateCheckUntil compares at Unix-second
// granularity, so in effect the lower bound is "not earlier than the current
// second". Confirm this is intended before changing the value.
const (
	minValidationPeriod = 1                   // 1ns as a time.Duration (see note above)
	maxValidationPeriod = 30 * 24 * time.Hour // 30 days
	// by default all validators need to sign, see defaultValidatorsVoteRequired.
)
    84  
// defaultValidatorsVoteRequired is the initial value of
// Witness.validatorVotesRequired: with 1.0, votePassed only succeeds once the
// voting power behind the received votes reaches the total voting power.
var defaultValidatorsVoteRequired = num.MustDecimalFromString("1.0")
    86  
    87  func init() {
    88  	// we seed the random generator just in case
    89  	// as the backoff library use random internally
    90  	// TODO this probably needs to change to something that can be agreed across all nodes.
    91  	rand.Seed(time.Now().UnixNano())
    92  }
    93  
// res tracks the witnessing of a single Resource: the deadline for checking
// it, the votes received so far, the progress of the local check and the
// callback to invoke once the outcome is known.
type res struct {
	res Resource
	// how long to run the check; also the deadline of the check context
	checkUntil time.Time
	mu         sync.Mutex
	votes      map[string]struct{} // keys of the nodes whose vote was received; the res methods access it under mu
	// the state of the checking (notValidated, validated or voteSent)
	state atomic.Uint32
	// cancels the context used to notify the checking routine to exit
	cfunc context.CancelFunc
	// the function to call once validation is done
	cb           func(interface{}, bool)
	lastSentVote time.Time // tick time at which OnTick last sent this node's vote
}
   108  
   109  func (r *res) addVote(key string) error {
   110  	r.mu.Lock()
   111  	defer r.mu.Unlock()
   112  
   113  	if _, ok := r.votes[key]; ok {
   114  		return ErrDuplicateVoteFromNode
   115  	}
   116  
   117  	// add the vote
   118  	r.votes[key] = struct{}{}
   119  	return nil
   120  }
   121  
   122  func (r *res) selfVoteReceived(self string) bool {
   123  	r.mu.Lock()
   124  	defer r.mu.Unlock()
   125  
   126  	_, ok := r.votes[self]
   127  	return ok
   128  }
   129  
   130  func (r *res) votePassed(t ValidatorTopology, requiredMajority num.Decimal) bool {
   131  	r.mu.Lock()
   132  	defer r.mu.Unlock()
   133  
   134  	count := int64(0)
   135  	for k := range r.votes {
   136  		if t.IsTendermintValidator(k) {
   137  			count += t.GetVotingPower(k)
   138  		}
   139  	}
   140  
   141  	return num.DecimalFromInt64(count).Div(num.DecimalFromInt64(t.GetTotalVotingPower())).GreaterThanOrEqual(requiredMajority)
   142  }
   143  
// Witness tracks resources that need to be checked and voted on by the
// validators.
//
// For each resource registered through StartCheck it runs the resource check
// in a goroutine (on validator nodes), sends this node's vote once the check
// succeeded, collects the votes of the other nodes through AddNodeCheck and,
// on every OnTick, calls the resource callback once enough voting power has
// voted or the deadline has passed.
//
// now is the time of the last OnTick (initialised from the TimeService);
// validatorVotesRequired is the share of voting power needed for a resource
// to pass; defaultConfirmations and approxBlockTime are keyed by chain ID and
// determine the delay before a resource is first checked.
type Witness struct {
	log *logging.Logger
	cfg Config
	ctx context.Context
	now time.Time
	top ValidatorTopology
	cmd Commander

	resources map[string]*res
	// handle sending transaction errors
	needResendMu  sync.Mutex
	needResendRes map[string]struct{}

	validatorVotesRequired num.Decimal
	wss                    *witnessSnapshotState

	defaultConfirmations map[string]int64
	approxBlockTime      map[string]time.Duration
}
   163  
   164  func NewWitness(ctx context.Context, log *logging.Logger, cfg Config, top ValidatorTopology, cmd Commander, tsvc TimeService) (w *Witness) {
   165  	log = log.Named(namedLogger)
   166  	log.SetLevel(cfg.Level.Get())
   167  
   168  	return &Witness{
   169  		ctx:                    ctx,
   170  		log:                    log,
   171  		cfg:                    cfg,
   172  		now:                    tsvc.GetTimeNow(),
   173  		cmd:                    cmd,
   174  		top:                    top,
   175  		resources:              map[string]*res{},
   176  		needResendRes:          map[string]struct{}{},
   177  		validatorVotesRequired: defaultValidatorsVoteRequired,
   178  		defaultConfirmations:   map[string]int64{},
   179  		approxBlockTime:        map[string]time.Duration{},
   180  		wss: &witnessSnapshotState{
   181  			serialised: []byte{},
   182  		},
   183  	}
   184  }
   185  
   186  func (w *Witness) SetPrimaryDefaultConfirmations(chainID string, c uint64) {
   187  	w.defaultConfirmations[chainID] = int64(c)
   188  	w.approxBlockTime[chainID] = w.cfg.ApproxEthereumBlockTime.Duration
   189  }
   190  
   191  func (w *Witness) SetSecondaryDefaultConfirmations(chainID string, c uint64, bt time.Duration) {
   192  	w.defaultConfirmations[chainID] = int64(c)
   193  	w.approxBlockTime[chainID] = bt
   194  }
   195  
// OnDefaultValidatorsVoteRequiredUpdate replaces the share of voting power
// that must vote for a resource before it passes. The new value is used by
// OnTick from the next tick on. It always returns nil.
func (w *Witness) OnDefaultValidatorsVoteRequiredUpdate(ctx context.Context, d num.Decimal) error {
	w.validatorVotesRequired = d
	return nil
}
   200  
   201  // ReloadConf updates the internal configuration.
   202  func (w *Witness) ReloadConf(cfg Config) {
   203  	w.log.Info("reloading configuration")
   204  	if w.log.GetLevel() != cfg.Level.Get() {
   205  		w.log.Info("updating log level",
   206  			logging.String("old", w.log.GetLevel().String()),
   207  			logging.String("new", cfg.Level.String()),
   208  		)
   209  		w.log.SetLevel(cfg.Level.Get())
   210  	}
   211  
   212  	w.cfg = cfg
   213  }
   214  
   215  func (w *Witness) Stop() {
   216  	// cancelling all context of checks which might be running
   217  	for _, v := range w.resources {
   218  		v.cfunc()
   219  	}
   220  }
   221  
   222  // AddNodeCheck registers a vote from a validator node for a given resource.
   223  func (w *Witness) AddNodeCheck(_ context.Context, nv *commandspb.NodeVote, key crypto.PublicKey) error {
   224  	// get the node proposal first
   225  	r, ok := w.resources[nv.Reference]
   226  	if !ok {
   227  		w.log.Error("invalid resource ID received for vote",
   228  			logging.String("resource-ref", nv.Reference),
   229  			logging.String("node-id", key.Hex()),
   230  		)
   231  		return ErrInvalidResourceIDForNodeVote
   232  	}
   233  
   234  	// ensure the node is a validator
   235  	if !w.top.IsValidatorVegaPubKey(key.Hex()) {
   236  		w.log.Error("non-validator node tried to register node vote",
   237  			logging.String("node-id", key.Hex()))
   238  		return ErrVoteFromNonValidator
   239  	}
   240  
   241  	return r.addVote(key.Hex())
   242  }
   243  
   244  func (w *Witness) StartCheck(
   245  	r Resource,
   246  	cb func(interface{}, bool),
   247  	checkUntil time.Time,
   248  ) error {
   249  	return w.startCheck(r, cb, checkUntil, w.defaultConfirmations[r.GetChainID()])
   250  }
   251  
// StartCheckWithDelay behaves like StartCheck but uses initialDelay instead
// of the chain's default confirmations. The delay is expressed as a count
// that the checking routine multiplies by the approximate block time of the
// resource's chain before running the first check.
//
// It returns ErrResourceDuplicate or ErrCheckUntilInvalid, see startCheck.
func (w *Witness) StartCheckWithDelay(
	r Resource,
	cb func(interface{}, bool),
	checkUntil time.Time,
	initialDelay int64,
) error {
	return w.startCheck(r, cb, checkUntil, initialDelay)
}
   260  
   261  func (w *Witness) startCheck(
   262  	r Resource,
   263  	cb func(interface{}, bool),
   264  	checkUntil time.Time,
   265  	initialDelay int64,
   266  ) error {
   267  	id := r.GetID()
   268  	if _, ok := w.resources[id]; ok {
   269  		return ErrResourceDuplicate
   270  	}
   271  
   272  	if err := w.validateCheckUntil(checkUntil); err != nil {
   273  		return err
   274  	}
   275  
   276  	ctx, cfunc := context.WithDeadline(w.ctx, checkUntil)
   277  	rs := &res{
   278  		res:        r,
   279  		checkUntil: checkUntil,
   280  		state:      atomic.Uint32{},
   281  		cfunc:      cfunc,
   282  		cb:         cb,
   283  		votes:      map[string]struct{}{},
   284  	}
   285  	rs.state.Store(notValidated)
   286  
   287  	w.resources[id] = rs
   288  
   289  	// if we are a validator, we just start the routine.
   290  	// so we can ensure the resources exists
   291  	if w.top.IsValidator() {
   292  		go w.start(ctx, rs, &initialDelay)
   293  	} else {
   294  		// if not a validator, we just jump to the state voteSent
   295  		// and will wait for all validator to approve basically.
   296  		// check succeeded
   297  		rs.state.Store(voteSent)
   298  	}
   299  	return nil
   300  }
   301  
   302  func (w *Witness) validateCheckUntil(checkUntil time.Time) error {
   303  	minValid, maxValid := w.now.Add(minValidationPeriod),
   304  		w.now.Add(maxValidationPeriod)
   305  	if checkUntil.Unix() < minValid.Unix() || checkUntil.Unix() > maxValid.Unix() {
   306  		if w.log.GetLevel() <= logging.DebugLevel {
   307  			w.log.Debug("invalid duration for witness",
   308  				logging.Time("check-until", checkUntil),
   309  				logging.Time("min-valid", minValid),
   310  				logging.Time("max-valid", maxValid),
   311  			)
   312  		}
   313  		return ErrCheckUntilInvalid
   314  	}
   315  	return nil
   316  }
   317  
   318  func newBackoff(ctx context.Context, maxElapsedTime time.Duration) backoff.BackOff {
   319  	bo := backoff.NewExponentialBackOff()
   320  	bo.MaxElapsedTime = maxElapsedTime
   321  	bo.InitialInterval = 1 * time.Second
   322  	return backoff.WithContext(bo, ctx)
   323  }
   324  
   325  func (w *Witness) start(ctx context.Context, r *res, initialDelay *int64) {
   326  	if initialDelay != nil {
   327  		t := time.NewTimer(time.Duration(*initialDelay) * w.approxBlockTime[r.res.GetChainID()])
   328  		<-t.C
   329  		t.Stop()
   330  	}
   331  
   332  	backff := newBackoff(ctx, r.checkUntil.Sub(w.now))
   333  	f := func() error {
   334  		w.log.Debug("Checking the resource", logging.String("asset-source", r.res.GetID()))
   335  
   336  		if err := r.res.Check(ctx); err != nil {
   337  			w.log.Error("Checking the resource failed", logging.Error(err))
   338  			return err
   339  		}
   340  		return nil
   341  	}
   342  
   343  	if err := backoff.Retry(f, backff); err != nil {
   344  		return
   345  	}
   346  
   347  	// check succeeded
   348  	r.state.Store(validated)
   349  }
   350  
   351  func (w *Witness) OnTick(ctx context.Context, t time.Time) {
   352  	w.now = t
   353  	isValidator := w.top.IsValidator()
   354  
   355  	// sort resources first
   356  	resourceIDs := make([]string, 0, len(w.resources))
   357  	for k := range w.resources {
   358  		resourceIDs = append(resourceIDs, k)
   359  	}
   360  	sort.Strings(resourceIDs)
   361  
   362  	// check if any resources passed checks
   363  	for _, k := range resourceIDs {
   364  		v := w.resources[k]
   365  
   366  		state := v.state.Load()
   367  		checkPass := v.votePassed(w.top, w.validatorVotesRequired)
   368  
   369  		// if the time is expired, or we received enough votes
   370  		if v.checkUntil.Before(t) || checkPass {
   371  			// cancel the context so it stops the routine right now
   372  			v.cfunc()
   373  
   374  			if !checkPass {
   375  				votesReceived := []string{}
   376  				votesMissing := []string{}
   377  				votePowers := []string{}
   378  				for _, k := range w.top.AllVegaPubKeys() {
   379  					if !w.top.IsTendermintValidator(k) {
   380  						continue
   381  					}
   382  					if _, ok := v.votes[k]; ok {
   383  						votesReceived = append(votesReceived, k)
   384  						votePowers = append(votePowers, strconv.FormatInt(w.top.GetVotingPower(k), 10))
   385  						continue
   386  					}
   387  					votesMissing = append(votesMissing, k)
   388  				}
   389  				w.log.Warn("resource checking was not validated by all nodes",
   390  					logging.String("resource-id", v.res.GetID()),
   391  					logging.Strings("votes-received", votesReceived),
   392  					logging.Strings("votes-missing", votesMissing),
   393  					logging.Strings("votes-power-received", votePowers),
   394  					logging.Int64("total-voting-power", w.top.GetTotalVotingPower()),
   395  				)
   396  			}
   397  
   398  			// callback to the resource holder
   399  			v.cb(v.res, checkPass)
   400  			// we delete the resource from our map.
   401  			delete(w.resources, k)
   402  			continue
   403  		}
   404  
   405  		// if we are a validator, and the resource was validated
   406  		// then we try to send our vote.
   407  		if isValidator && state == validated || w.needResend(k) {
   408  			v.lastSentVote = t
   409  			nv := &commandspb.NodeVote{
   410  				Reference: v.res.GetID(),
   411  				Type:      v.res.GetType(),
   412  			}
   413  			w.cmd.Command(ctx, txn.NodeVoteCommand, nv, w.onCommandSent(k), nil)
   414  			// set new state so we do not try to validate again
   415  			v.state.Store(voteSent)
   416  		} else if (isValidator && state == voteSent) && t.After(v.lastSentVote.Add(w.cfg.NodeVoteResendInterval.Duration)) {
   417  			if v.selfVoteReceived(w.top.SelfVegaPubKey()) {
   418  				continue
   419  			}
   420  			w.onCommandSent(v.res.GetID())("", fmt.Errorf("no self votes received after %s", w.cfg.NodeVoteResendInterval.Duration.String()))
   421  		}
   422  	}
   423  }
   424  
   425  func (w *Witness) needResend(res string) bool {
   426  	w.needResendMu.Lock()
   427  	defer w.needResendMu.Unlock()
   428  	if _, ok := w.needResendRes[res]; ok {
   429  		delete(w.needResendRes, res)
   430  		return true
   431  	}
   432  	return false
   433  }
   434  
   435  func (w *Witness) onCommandSent(res string) func(string, error) {
   436  	return func(_ string, err error) {
   437  		if err != nil {
   438  			w.log.Error("could not send command", logging.String("res-id", res), logging.Error(err))
   439  			w.needResendMu.Lock()
   440  			defer w.needResendMu.Unlock()
   441  			w.needResendRes[res] = struct{}{}
   442  		}
   443  	}
   444  }