code.vegaprotocol.io/vega@v0.79.0/core/validators/witness.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package validators 17 18 import ( 19 "context" 20 "errors" 21 "fmt" 22 "math/rand" 23 "sort" 24 "strconv" 25 "sync" 26 "sync/atomic" 27 "time" 28 29 "code.vegaprotocol.io/vega/core/txn" 30 "code.vegaprotocol.io/vega/libs/crypto" 31 "code.vegaprotocol.io/vega/libs/num" 32 "code.vegaprotocol.io/vega/logging" 33 commandspb "code.vegaprotocol.io/vega/protos/vega/commands/v1" 34 35 "github.com/cenkalti/backoff" 36 "github.com/golang/protobuf/proto" 37 ) 38 39 var ( 40 ErrResourceDuplicate = errors.New("resource duplicate") 41 ErrCheckUntilInvalid = errors.New("invalid time to check until") 42 ErrInvalidResourceIDForNodeVote = errors.New("invalid resource ID") 43 ErrVoteFromNonValidator = errors.New("vote from non validator") 44 ErrDuplicateVoteFromNode = errors.New("duplicate vote from node") 45 ) 46 47 type TimeService interface { 48 GetTimeNow() time.Time 49 } 50 51 type Commander interface { 52 Command(ctx context.Context, cmd txn.Command, payload proto.Message, f func(string, error), bo *backoff.ExponentialBackOff) 53 CommandSync(ctx context.Context, cmd txn.Command, payload proto.Message, f func(string, error), bo *backoff.ExponentialBackOff) 54 } 55 56 type ValidatorTopology interface { 57 IsValidator() bool 58 SelfVegaPubKey() string 59 AllVegaPubKeys() []string 60 IsValidatorVegaPubKey(string) bool 61 IsTendermintValidator(string) bool 62 GetVotingPower(pubkey string) int64 63 GetTotalVotingPower() int64 64 } 65 66 type Resource interface { 67 GetID() string 68 GetType() commandspb.NodeVote_Type 69 Check(ctx context.Context) error 70 GetChainID() string 71 } 72 73 const ( 74 notValidated uint32 = iota 75 validated 76 voteSent 77 ) 78 79 const ( 80 minValidationPeriod = 1 // sec minutes 81 maxValidationPeriod = 30 * 24 * time.Hour // 30 days 82 // by default all validators needs to sign. 83 ) 84 85 var defaultValidatorsVoteRequired = num.MustDecimalFromString("1.0") 86 87 func init() { 88 // we seed the random generator just in case 89 // as the backoff library use random internally 90 // TODO this probably needs to change to something that can be agreed across all nodes. 91 rand.Seed(time.Now().UnixNano()) 92 } 93 94 type res struct { 95 res Resource 96 // how long to run the check 97 checkUntil time.Time 98 mu sync.Mutex 99 votes map[string]struct{} // checks vote sent by the nodes 100 // the stated of the checking 101 state atomic.Uint32 102 // the context used to notify the routine to exit 103 cfunc context.CancelFunc 104 // the function to call one validation is done 105 cb func(interface{}, bool) 106 lastSentVote time.Time 107 } 108 109 func (r *res) addVote(key string) error { 110 r.mu.Lock() 111 defer r.mu.Unlock() 112 113 if _, ok := r.votes[key]; ok { 114 return ErrDuplicateVoteFromNode 115 } 116 117 // add the vote 118 r.votes[key] = struct{}{} 119 return nil 120 } 121 122 func (r *res) selfVoteReceived(self string) bool { 123 r.mu.Lock() 124 defer r.mu.Unlock() 125 126 _, ok := r.votes[self] 127 return ok 128 } 129 130 func (r *res) votePassed(t ValidatorTopology, requiredMajority num.Decimal) bool { 131 r.mu.Lock() 132 defer r.mu.Unlock() 133 134 count := int64(0) 135 for k := range r.votes { 136 if t.IsTendermintValidator(k) { 137 count += t.GetVotingPower(k) 138 } 139 } 140 141 return num.DecimalFromInt64(count).Div(num.DecimalFromInt64(t.GetTotalVotingPower())).GreaterThanOrEqual(requiredMajority) 142 } 143 144 type Witness struct { 145 log *logging.Logger 146 cfg Config 147 ctx context.Context 148 now time.Time 149 top ValidatorTopology 150 cmd Commander 151 152 resources map[string]*res 153 // handle sending transaction errors 154 needResendMu sync.Mutex 155 needResendRes map[string]struct{} 156 157 validatorVotesRequired num.Decimal 158 wss *witnessSnapshotState 159 160 defaultConfirmations map[string]int64 161 approxBlockTime map[string]time.Duration 162 } 163 164 func NewWitness(ctx context.Context, log *logging.Logger, cfg Config, top ValidatorTopology, cmd Commander, tsvc TimeService) (w *Witness) { 165 log = log.Named(namedLogger) 166 log.SetLevel(cfg.Level.Get()) 167 168 return &Witness{ 169 ctx: ctx, 170 log: log, 171 cfg: cfg, 172 now: tsvc.GetTimeNow(), 173 cmd: cmd, 174 top: top, 175 resources: map[string]*res{}, 176 needResendRes: map[string]struct{}{}, 177 validatorVotesRequired: defaultValidatorsVoteRequired, 178 defaultConfirmations: map[string]int64{}, 179 approxBlockTime: map[string]time.Duration{}, 180 wss: &witnessSnapshotState{ 181 serialised: []byte{}, 182 }, 183 } 184 } 185 186 func (w *Witness) SetPrimaryDefaultConfirmations(chainID string, c uint64) { 187 w.defaultConfirmations[chainID] = int64(c) 188 w.approxBlockTime[chainID] = w.cfg.ApproxEthereumBlockTime.Duration 189 } 190 191 func (w *Witness) SetSecondaryDefaultConfirmations(chainID string, c uint64, bt time.Duration) { 192 w.defaultConfirmations[chainID] = int64(c) 193 w.approxBlockTime[chainID] = bt 194 } 195 196 func (w *Witness) OnDefaultValidatorsVoteRequiredUpdate(ctx context.Context, d num.Decimal) error { 197 w.validatorVotesRequired = d 198 return nil 199 } 200 201 // ReloadConf updates the internal configuration. 202 func (w *Witness) ReloadConf(cfg Config) { 203 w.log.Info("reloading configuration") 204 if w.log.GetLevel() != cfg.Level.Get() { 205 w.log.Info("updating log level", 206 logging.String("old", w.log.GetLevel().String()), 207 logging.String("new", cfg.Level.String()), 208 ) 209 w.log.SetLevel(cfg.Level.Get()) 210 } 211 212 w.cfg = cfg 213 } 214 215 func (w *Witness) Stop() { 216 // cancelling all context of checks which might be running 217 for _, v := range w.resources { 218 v.cfunc() 219 } 220 } 221 222 // AddNodeCheck registers a vote from a validator node for a given resource. 223 func (w *Witness) AddNodeCheck(_ context.Context, nv *commandspb.NodeVote, key crypto.PublicKey) error { 224 // get the node proposal first 225 r, ok := w.resources[nv.Reference] 226 if !ok { 227 w.log.Error("invalid resource ID received for vote", 228 logging.String("resource-ref", nv.Reference), 229 logging.String("node-id", key.Hex()), 230 ) 231 return ErrInvalidResourceIDForNodeVote 232 } 233 234 // ensure the node is a validator 235 if !w.top.IsValidatorVegaPubKey(key.Hex()) { 236 w.log.Error("non-validator node tried to register node vote", 237 logging.String("node-id", key.Hex())) 238 return ErrVoteFromNonValidator 239 } 240 241 return r.addVote(key.Hex()) 242 } 243 244 func (w *Witness) StartCheck( 245 r Resource, 246 cb func(interface{}, bool), 247 checkUntil time.Time, 248 ) error { 249 return w.startCheck(r, cb, checkUntil, w.defaultConfirmations[r.GetChainID()]) 250 } 251 252 func (w *Witness) StartCheckWithDelay( 253 r Resource, 254 cb func(interface{}, bool), 255 checkUntil time.Time, 256 initialDelay int64, 257 ) error { 258 return w.startCheck(r, cb, checkUntil, initialDelay) 259 } 260 261 func (w *Witness) startCheck( 262 r Resource, 263 cb func(interface{}, bool), 264 checkUntil time.Time, 265 initialDelay int64, 266 ) error { 267 id := r.GetID() 268 if _, ok := w.resources[id]; ok { 269 return ErrResourceDuplicate 270 } 271 272 if err := w.validateCheckUntil(checkUntil); err != nil { 273 return err 274 } 275 276 ctx, cfunc := context.WithDeadline(w.ctx, checkUntil) 277 rs := &res{ 278 res: r, 279 checkUntil: checkUntil, 280 state: atomic.Uint32{}, 281 cfunc: cfunc, 282 cb: cb, 283 votes: map[string]struct{}{}, 284 } 285 rs.state.Store(notValidated) 286 287 w.resources[id] = rs 288 289 // if we are a validator, we just start the routine. 290 // so we can ensure the resources exists 291 if w.top.IsValidator() { 292 go w.start(ctx, rs, &initialDelay) 293 } else { 294 // if not a validator, we just jump to the state voteSent 295 // and will wait for all validator to approve basically. 296 // check succeeded 297 rs.state.Store(voteSent) 298 } 299 return nil 300 } 301 302 func (w *Witness) validateCheckUntil(checkUntil time.Time) error { 303 minValid, maxValid := w.now.Add(minValidationPeriod), 304 w.now.Add(maxValidationPeriod) 305 if checkUntil.Unix() < minValid.Unix() || checkUntil.Unix() > maxValid.Unix() { 306 if w.log.GetLevel() <= logging.DebugLevel { 307 w.log.Debug("invalid duration for witness", 308 logging.Time("check-until", checkUntil), 309 logging.Time("min-valid", minValid), 310 logging.Time("max-valid", maxValid), 311 ) 312 } 313 return ErrCheckUntilInvalid 314 } 315 return nil 316 } 317 318 func newBackoff(ctx context.Context, maxElapsedTime time.Duration) backoff.BackOff { 319 bo := backoff.NewExponentialBackOff() 320 bo.MaxElapsedTime = maxElapsedTime 321 bo.InitialInterval = 1 * time.Second 322 return backoff.WithContext(bo, ctx) 323 } 324 325 func (w *Witness) start(ctx context.Context, r *res, initialDelay *int64) { 326 if initialDelay != nil { 327 t := time.NewTimer(time.Duration(*initialDelay) * w.approxBlockTime[r.res.GetChainID()]) 328 <-t.C 329 t.Stop() 330 } 331 332 backff := newBackoff(ctx, r.checkUntil.Sub(w.now)) 333 f := func() error { 334 w.log.Debug("Checking the resource", logging.String("asset-source", r.res.GetID())) 335 336 if err := r.res.Check(ctx); err != nil { 337 w.log.Error("Checking the resource failed", logging.Error(err)) 338 return err 339 } 340 return nil 341 } 342 343 if err := backoff.Retry(f, backff); err != nil { 344 return 345 } 346 347 // check succeeded 348 r.state.Store(validated) 349 } 350 351 func (w *Witness) OnTick(ctx context.Context, t time.Time) { 352 w.now = t 353 isValidator := w.top.IsValidator() 354 355 // sort resources first 356 resourceIDs := make([]string, 0, len(w.resources)) 357 for k := range w.resources { 358 resourceIDs = append(resourceIDs, k) 359 } 360 sort.Strings(resourceIDs) 361 362 // check if any resources passed checks 363 for _, k := range resourceIDs { 364 v := w.resources[k] 365 366 state := v.state.Load() 367 checkPass := v.votePassed(w.top, w.validatorVotesRequired) 368 369 // if the time is expired, or we received enough votes 370 if v.checkUntil.Before(t) || checkPass { 371 // cancel the context so it stops the routine right now 372 v.cfunc() 373 374 if !checkPass { 375 votesReceived := []string{} 376 votesMissing := []string{} 377 votePowers := []string{} 378 for _, k := range w.top.AllVegaPubKeys() { 379 if !w.top.IsTendermintValidator(k) { 380 continue 381 } 382 if _, ok := v.votes[k]; ok { 383 votesReceived = append(votesReceived, k) 384 votePowers = append(votePowers, strconv.FormatInt(w.top.GetVotingPower(k), 10)) 385 continue 386 } 387 votesMissing = append(votesMissing, k) 388 } 389 w.log.Warn("resource checking was not validated by all nodes", 390 logging.String("resource-id", v.res.GetID()), 391 logging.Strings("votes-received", votesReceived), 392 logging.Strings("votes-missing", votesMissing), 393 logging.Strings("votes-power-received", votePowers), 394 logging.Int64("total-voting-power", w.top.GetTotalVotingPower()), 395 ) 396 } 397 398 // callback to the resource holder 399 v.cb(v.res, checkPass) 400 // we delete the resource from our map. 401 delete(w.resources, k) 402 continue 403 } 404 405 // if we are a validator, and the resource was validated 406 // then we try to send our vote. 407 if isValidator && state == validated || w.needResend(k) { 408 v.lastSentVote = t 409 nv := &commandspb.NodeVote{ 410 Reference: v.res.GetID(), 411 Type: v.res.GetType(), 412 } 413 w.cmd.Command(ctx, txn.NodeVoteCommand, nv, w.onCommandSent(k), nil) 414 // set new state so we do not try to validate again 415 v.state.Store(voteSent) 416 } else if (isValidator && state == voteSent) && t.After(v.lastSentVote.Add(w.cfg.NodeVoteResendInterval.Duration)) { 417 if v.selfVoteReceived(w.top.SelfVegaPubKey()) { 418 continue 419 } 420 w.onCommandSent(v.res.GetID())("", fmt.Errorf("no self votes received after %s", w.cfg.NodeVoteResendInterval.Duration.String())) 421 } 422 } 423 } 424 425 func (w *Witness) needResend(res string) bool { 426 w.needResendMu.Lock() 427 defer w.needResendMu.Unlock() 428 if _, ok := w.needResendRes[res]; ok { 429 delete(w.needResendRes, res) 430 return true 431 } 432 return false 433 } 434 435 func (w *Witness) onCommandSent(res string) func(string, error) { 436 return func(_ string, err error) { 437 if err != nil { 438 w.log.Error("could not send command", logging.String("res-id", res), logging.Error(err)) 439 w.needResendMu.Lock() 440 defer w.needResendMu.Unlock() 441 w.needResendRes[res] = struct{}{} 442 } 443 } 444 }