github.com/cryptogateway/go-paymex@v0.0.0-20210204174735-96277fb1e602/les/serverpool.go

// Copyright 2020 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package les

import (
	"errors"
	"math/rand"
	"reflect"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cryptogateway/go-paymex/common/mclock"
	"github.com/cryptogateway/go-paymex/ethdb"
	lpc "github.com/cryptogateway/go-paymex/les/lespay/client"
	"github.com/cryptogateway/go-paymex/les/utils"
	"github.com/cryptogateway/go-paymex/log"
	"github.com/cryptogateway/go-paymex/p2p/enode"
	"github.com/cryptogateway/go-paymex/p2p/enr"
	"github.com/cryptogateway/go-paymex/p2p/nodestate"
	"github.com/cryptogateway/go-paymex/rlp"
)

const (
	minTimeout          = time.Millisecond * 500 // minimum request timeout suggested by the server pool
	timeoutRefresh      = time.Second * 5        // recalculate timeout if older than this
	dialCost            = 10000                  // cost of a TCP dial (used for known node selection weight calculation)
	dialWaitStep        = 1.5                    // exponential multiplier of redial wait time when no value was provided by the server
	queryCost           = 500                    // cost of a UDP pre-negotiation query
	queryWaitStep       = 1.02                   // exponential multiplier of redial wait time when no value was provided by the server
	waitThreshold       = time.Hour * 2000       // drop node if waiting time is over the threshold
	nodeWeightMul       = 1000000                // multiplier constant for node weight calculation
	nodeWeightThreshold = 100                    // minimum weight for keeping a node in the known (valuable) set
	minRedialWait       = 10                     // minimum redial wait time in seconds
	preNegLimit         = 5                      // maximum number of simultaneous pre-negotiation queries
	maxQueryFails       = 100                    // number of consecutive UDP query failures before we print a warning
)

// serverPool provides a node iterator for dial candidates. The output is a mix of newly discovered
// nodes, a weighted random selection of known (previously valuable) nodes and trusted/paid nodes.
type serverPool struct {
	clock    mclock.Clock
	unixTime func() int64
	db       ethdb.KeyValueStore

	ns           *nodestate.NodeStateMachine
	vt           *lpc.ValueTracker
	mixer        *enode.FairMix
	mixSources   []enode.Iterator
	dialIterator enode.Iterator
	validSchemes enr.IdentityScheme
	trustedURLs  []string
	fillSet      *lpc.FillSet
	queryFails   uint32

	timeoutLock      sync.RWMutex
	timeout          time.Duration
	timeWeights      lpc.ResponseTimeWeights
	timeoutRefreshed mclock.AbsTime
}

// nodeHistory keeps track of dial costs which determine node weight together with the
// service value calculated by lpc.ValueTracker.
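// Dial costs are stored as an ExpiredValue and the redial wait window as unix
// timestamps, so the recorded history survives a client restart (the field is
// persisted through sfiNodeHistory).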
type nodeHistory struct {
	dialCost                       utils.ExpiredValue
	redialWaitStart, redialWaitEnd int64 // unix time (seconds)
}

type nodeHistoryEnc struct {
	DialCost                       utils.ExpiredValue
	RedialWaitStart, RedialWaitEnd uint64
}

// queryFunc sends a pre-negotiation query and blocks until a response arrives or timeout occurs.
// It returns 1 if the remote node has confirmed that connection is possible, 0 if not
// possible and -1 if no response arrived (timeout).
type queryFunc func(*enode.Node) int

var (
	serverPoolSetup    = &nodestate.Setup{Version: 1}
	sfHasValue         = serverPoolSetup.NewPersistentFlag("hasValue")
	sfQueried          = serverPoolSetup.NewFlag("queried")
	sfCanDial          = serverPoolSetup.NewFlag("canDial")
	sfDialing          = serverPoolSetup.NewFlag("dialed")
	sfWaitDialTimeout  = serverPoolSetup.NewFlag("dialTimeout")
	sfConnected        = serverPoolSetup.NewFlag("connected")
	sfRedialWait       = serverPoolSetup.NewFlag("redialWait")
	sfAlwaysConnect    = serverPoolSetup.NewFlag("alwaysConnect")
	sfDisableSelection = nodestate.MergeFlags(sfQueried, sfCanDial, sfDialing, sfConnected, sfRedialWait)

	sfiNodeHistory = serverPoolSetup.NewPersistentField("nodeHistory", reflect.TypeOf(nodeHistory{}),
		func(field interface{}) ([]byte, error) {
			if n, ok := field.(nodeHistory); ok {
				ne := nodeHistoryEnc{
					DialCost:        n.dialCost,
					RedialWaitStart: uint64(n.redialWaitStart),
					RedialWaitEnd:   uint64(n.redialWaitEnd),
				}
				enc, err := rlp.EncodeToBytes(&ne)
				return enc, err
			}
			return nil, errors.New("invalid field type")
		},
		func(enc []byte) (interface{}, error) {
			var ne nodeHistoryEnc
			err := rlp.DecodeBytes(enc, &ne)
			n := nodeHistory{
				dialCost:        ne.DialCost,
				redialWaitStart: int64(ne.RedialWaitStart),
				redialWaitEnd:   int64(ne.RedialWaitEnd),
			}
			return n, err
		},
	)
	sfiNodeWeight     = serverPoolSetup.NewField("nodeWeight", reflect.TypeOf(uint64(0)))
	sfiConnectedStats = serverPoolSetup.NewField("connectedStats", reflect.TypeOf(lpc.ResponseTimeStats{}))
)

// newServerPool creates a new server pool
func newServerPool(db ethdb.KeyValueStore, dbKey []byte, vt *lpc.ValueTracker, mixTimeout time.Duration, query queryFunc, clock mclock.Clock, trustedURLs []string) *serverPool {
	s := &serverPool{
		db:           db,
		clock:        clock,
		unixTime:     func() int64 { return time.Now().Unix() },
		validSchemes: enode.ValidSchemes,
		trustedURLs:  trustedURLs,
		vt:           vt,
		ns:           nodestate.NewNodeStateMachine(db, []byte(string(dbKey)+"ns:"), clock, serverPoolSetup),
	}
	s.recalTimeout()
	s.mixer = enode.NewFairMix(mixTimeout)
	knownSelector := lpc.NewWrsIterator(s.ns, sfHasValue, sfDisableSelection, sfiNodeWeight)
	alwaysConnect := lpc.NewQueueIterator(s.ns, sfAlwaysConnect, sfDisableSelection, true, nil)
	s.mixSources = append(s.mixSources, knownSelector)
	s.mixSources = append(s.mixSources, alwaysConnect)

	iter := enode.Iterator(s.mixer)
	if query != nil {
		iter = s.addPreNegFilter(iter, query)
	}
	s.dialIterator = enode.Filter(iter, func(node *enode.Node) bool {
		s.ns.SetState(node, sfDialing, sfCanDial, 0)
		s.ns.SetState(node, sfWaitDialTimeout, nodestate.Flags{}, time.Second*10)
		return true
	})

	s.ns.SubscribeState(nodestate.MergeFlags(sfWaitDialTimeout, sfConnected), func(n *enode.Node, oldState, newState nodestate.Flags) {
		if oldState.Equals(sfWaitDialTimeout) && newState.IsEmpty() {
			// dial timeout, no connection
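			// charge a full dial cost for the failed attempt and raise the
			// redial wait accordingly (dialWaitStep)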
			s.setRedialWait(n, dialCost, dialWaitStep)
			s.ns.SetStateSub(n, nodestate.Flags{}, sfDialing, 0)
		}
	})

	s.ns.AddLogMetrics(sfHasValue, sfDisableSelection, "selectable", nil, nil, serverSelectableGauge)
	s.ns.AddLogMetrics(sfDialing, nodestate.Flags{}, "dialed", serverDialedMeter, nil, nil)
	s.ns.AddLogMetrics(sfConnected, nodestate.Flags{}, "connected", nil, nil, serverConnectedGauge)
	return s
}

// addSource adds a node discovery source to the server pool (should be called before start)
func (s *serverPool) addSource(source enode.Iterator) {
	if source != nil {
		s.mixSources = append(s.mixSources, source)
	}
}

// addPreNegFilter installs a node filter mechanism that performs a pre-negotiation query.
// Nodes that are filtered out and do not appear on the output iterator are put back
// into redialWait state.
func (s *serverPool) addPreNegFilter(input enode.Iterator, query queryFunc) enode.Iterator {
	s.fillSet = lpc.NewFillSet(s.ns, input, sfQueried)
	s.ns.SubscribeState(sfQueried, func(n *enode.Node, oldState, newState nodestate.Flags) {
		if newState.Equals(sfQueried) {
			fails := atomic.LoadUint32(&s.queryFails)
			if fails == maxQueryFails {
				log.Warn("UDP pre-negotiation query does not seem to work")
			}
			if fails > maxQueryFails {
				fails = maxQueryFails
			}
			if rand.Intn(maxQueryFails*2) < int(fails) {
				// skip pre-negotiation with increasing chance, max 50%
				// this ensures that the client can operate even if UDP is not working at all
				s.ns.SetStateSub(n, sfCanDial, nodestate.Flags{}, time.Second*10)
				// set canDial before resetting queried so that FillSet will not read more
				// candidates unnecessarily
				s.ns.SetStateSub(n, nodestate.Flags{}, sfQueried, 0)
				return
			}
			go func() {
				q := query(n)
				if q == -1 {
					atomic.AddUint32(&s.queryFails, 1)
				} else {
					atomic.StoreUint32(&s.queryFails, 0)
				}
				s.ns.Operation(func() {
					// we are no longer running in the operation that the callback belongs to, start a new one because of setRedialWait
					if q == 1 {
						s.ns.SetStateSub(n, sfCanDial, nodestate.Flags{}, time.Second*10)
					} else {
						s.setRedialWait(n, queryCost, queryWaitStep)
					}
					s.ns.SetStateSub(n, nodestate.Flags{}, sfQueried, 0)
				})
			}()
		}
	})
	return lpc.NewQueueIterator(s.ns, sfCanDial, nodestate.Flags{}, false, func(waiting bool) {
		if waiting {
			s.fillSet.SetTarget(preNegLimit)
		} else {
			s.fillSet.SetTarget(0)
		}
	})
}

// start starts the server pool. Note that NodeStateMachine should be started first.
func (s *serverPool) start() {
	s.ns.Start()
	for _, iter := range s.mixSources {
		// add sources to mixer at startup because the mixer instantly tries to read them
		// which should only happen after NodeStateMachine has been started
		s.mixer.AddSource(iter)
	}
	for _, url := range s.trustedURLs {
		if node, err := enode.Parse(s.validSchemes, url); err == nil {
			s.ns.SetState(node, sfAlwaysConnect, nodestate.Flags{}, 0)
		} else {
			log.Error("Invalid trusted server URL", "url", url, "error", err)
		}
	}
	unixTime := s.unixTime()
	s.ns.Operation(func() {
		s.ns.ForEach(sfHasValue, nodestate.Flags{}, func(node *enode.Node, state nodestate.Flags) {
			s.calculateWeight(node)
			if n, ok := s.ns.GetField(node, sfiNodeHistory).(nodeHistory); ok && n.redialWaitEnd > unixTime {
				wait := n.redialWaitEnd - unixTime
				lastWait := n.redialWaitEnd - n.redialWaitStart
				if wait > lastWait {
					// if the time until expiration is larger than the last suggested
					// waiting time then the system clock was probably adjusted
					wait = lastWait
				}
				s.ns.SetStateSub(node, sfRedialWait, nodestate.Flags{}, time.Duration(wait)*time.Second)
			}
		})
	})
}

// stop stops the server pool
func (s *serverPool) stop() {
	s.dialIterator.Close()
	if s.fillSet != nil {
		s.fillSet.Close()
	}
	s.ns.Operation(func() {
		s.ns.ForEach(sfConnected, nodestate.Flags{}, func(n *enode.Node, state nodestate.Flags) {
			// recalculate weight of connected nodes in order to update hasValue flag if necessary
			s.calculateWeight(n)
		})
	})
	s.ns.Stop()
}

// registerPeer implements serverPeerSubscriber
func (s *serverPool) registerPeer(p *serverPeer) {
	s.ns.SetState(p.Node(), sfConnected, sfDialing.Or(sfWaitDialTimeout), 0)
	nvt := s.vt.Register(p.ID())
	s.ns.SetField(p.Node(), sfiConnectedStats, nvt.RtStats())
	p.setValueTracker(s.vt, nvt)
	p.updateVtParams()
}

// unregisterPeer implements serverPeerSubscriber
func (s *serverPool) unregisterPeer(p *serverPeer) {
	s.ns.Operation(func() {
		s.setRedialWait(p.Node(), dialCost, dialWaitStep)
		s.ns.SetStateSub(p.Node(), nodestate.Flags{}, sfConnected, 0)
		s.ns.SetFieldSub(p.Node(), sfiConnectedStats, nil)
	})
	s.vt.Unregister(p.ID())
	p.setValueTracker(nil, nil)
}

// recalTimeout calculates the current recommended timeout. This value is used by
// the client as a "soft timeout" value. It also affects the service value calculation
// of individual nodes.
func (s *serverPool) recalTimeout() {
	// Use cached result if possible, avoid recalculating too frequently.
	s.timeoutLock.RLock()
	refreshed := s.timeoutRefreshed
	s.timeoutLock.RUnlock()
	now := s.clock.Now()
	if refreshed != 0 && time.Duration(now-refreshed) < timeoutRefresh {
		return
	}
	// Cached result is stale, recalculate a new one.
	rts := s.vt.RtStats()

	// Add a fake statistic here. It is an easy way to initialize with some
	// conservative values when the database is new. As soon as we have a
	// considerable amount of real stats this small value won't matter.
	rts.Add(time.Second*2, 10, s.vt.StatsExpFactor())

	// Use either 10% failure rate timeout or twice the median response time
	// as the recommended timeout.
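	// e.g. if the 10% failure rate timeout is 300ms and the median response time
	// is 400ms then the suggested timeout becomes 800ms; it is never lower than
	// minTimeout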
	timeout := minTimeout
	if t := rts.Timeout(0.1); t > timeout {
		timeout = t
	}
	if t := rts.Timeout(0.5) * 2; t > timeout {
		timeout = t
	}
	s.timeoutLock.Lock()
	if s.timeout != timeout {
		s.timeout = timeout
		s.timeWeights = lpc.TimeoutWeights(s.timeout)

		suggestedTimeoutGauge.Update(int64(s.timeout / time.Millisecond))
		totalValueGauge.Update(int64(rts.Value(s.timeWeights, s.vt.StatsExpFactor())))
	}
	s.timeoutRefreshed = now
	s.timeoutLock.Unlock()
}

// getTimeout returns the recommended request timeout.
func (s *serverPool) getTimeout() time.Duration {
	s.recalTimeout()
	s.timeoutLock.RLock()
	defer s.timeoutLock.RUnlock()
	return s.timeout
}

// getTimeoutAndWeight returns the recommended request timeout as well as the
// response time weight which is necessary to calculate service value.
func (s *serverPool) getTimeoutAndWeight() (time.Duration, lpc.ResponseTimeWeights) {
	s.recalTimeout()
	s.timeoutLock.RLock()
	defer s.timeoutLock.RUnlock()
	return s.timeout, s.timeWeights
}

// addDialCost adds the given amount of dial cost to the node history and returns the current
// amount of total dial cost
func (s *serverPool) addDialCost(n *nodeHistory, amount int64) uint64 {
	logOffset := s.vt.StatsExpirer().LogOffset(s.clock.Now())
	if amount > 0 {
		n.dialCost.Add(amount, logOffset)
	}
	totalDialCost := n.dialCost.Value(logOffset)
	if totalDialCost < dialCost {
		totalDialCost = dialCost
	}
	return totalDialCost
}

// serviceValue returns the service value accumulated in this session and in total
func (s *serverPool) serviceValue(node *enode.Node) (sessionValue, totalValue float64) {
	nvt := s.vt.GetNode(node.ID())
	if nvt == nil {
		return 0, 0
	}
	currentStats := nvt.RtStats()
	_, timeWeights := s.getTimeoutAndWeight()
	expFactor := s.vt.StatsExpFactor()

	totalValue = currentStats.Value(timeWeights, expFactor)
	if connStats, ok := s.ns.GetField(node, sfiConnectedStats).(lpc.ResponseTimeStats); ok {
		diff := currentStats
		diff.SubStats(&connStats)
		sessionValue = diff.Value(timeWeights, expFactor)
		sessionValueMeter.Mark(int64(sessionValue))
	}
	return
}

// updateWeight calculates the node weight and updates the nodeWeight field and the
// hasValue flag. It also saves the node state if necessary.
// Note: this function should run inside a NodeStateMachine operation
func (s *serverPool) updateWeight(node *enode.Node, totalValue float64, totalDialCost uint64) {
	weight := uint64(totalValue * nodeWeightMul / float64(totalDialCost))
	if weight >= nodeWeightThreshold {
		s.ns.SetStateSub(node, sfHasValue, nodestate.Flags{}, 0)
		s.ns.SetFieldSub(node, sfiNodeWeight, weight)
	} else {
		s.ns.SetStateSub(node, nodestate.Flags{}, sfHasValue, 0)
		s.ns.SetFieldSub(node, sfiNodeWeight, nil)
		s.ns.SetFieldSub(node, sfiNodeHistory, nil)
	}
	s.ns.Persist(node) // saved if node history or hasValue changed
}

// setRedialWait calculates and sets the redialWait timeout based on the service value
// and dial cost accumulated during the last session/attempt and in total.
// The waiting time is raised exponentially if no service value has been received in order
// to prevent dialing an unresponsive node frequently for a very long time just because it
// was useful in the past.
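// (With dialWaitStep = 1.5 the wait after repeated unsuccessful dials starts at
// minRedialWait and then grows by roughly 50% per failure, while failed
// pre-negotiation queries use the much smaller queryWaitStep.)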
// It can still be occasionally dialed though and once it provides
// a significant amount of service value again its waiting time is quickly reduced or reset
// to the minimum.
// Note: node weight is also recalculated and updated by this function.
// Note 2: this function should run inside a NodeStateMachine operation
func (s *serverPool) setRedialWait(node *enode.Node, addDialCost int64, waitStep float64) {
	n, _ := s.ns.GetField(node, sfiNodeHistory).(nodeHistory)
	sessionValue, totalValue := s.serviceValue(node)
	totalDialCost := s.addDialCost(&n, addDialCost)

	// if the current dial session has yielded at least the average value/dial cost ratio
	// then the waiting time should be reset to the minimum. If the session value
	// is below average but still positive then timeout is limited to the ratio of
	// average / current service value multiplied by the minimum timeout. If the attempt
	// was unsuccessful then timeout is raised exponentially without limitation.
	// Note: dialCost is used in the formula below even if dial was not attempted at all
	// because the pre-negotiation query did not return a positive result. In this case
	// the ratio has no meaning anyway and waitFactor is always raised, though in smaller
	// steps because queries are cheaper and therefore we can allow more failed attempts.
	unixTime := s.unixTime()
	plannedTimeout := float64(n.redialWaitEnd - n.redialWaitStart) // last planned redialWait timeout
	var actualWait float64                                         // actual waiting time elapsed
	if unixTime > n.redialWaitEnd {
		// the planned timeout has elapsed
		actualWait = plannedTimeout
	} else {
		// if the node was redialed earlier then we do not raise the planned timeout
		// exponentially because that could lead to the timeout rising very high in
		// a short amount of time
		// Note that in case of an early redial actualWait also includes the dial
		// timeout or connection time of the last attempt but it still serves its
		// purpose of preventing the timeout rising quicker than linearly as a function
		// of total time elapsed without a successful connection.
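		// e.g. a node redialed 20 seconds into a 60 second planned wait yields
		// actualWait = 20, so nextTimeout below stays at the planned 60 seconds
		// instead of jumping to 90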
		actualWait = float64(unixTime - n.redialWaitStart)
	}
	// raise timeout exponentially if the last planned timeout has elapsed
	// (use at least the last planned timeout otherwise)
	nextTimeout := actualWait * waitStep
	if plannedTimeout > nextTimeout {
		nextTimeout = plannedTimeout
	}
	// we reduce the waiting time if the server has provided service value during the
	// connection (but never under the minimum)
	a := totalValue * dialCost * float64(minRedialWait)
	b := float64(totalDialCost) * sessionValue
	if a < b*nextTimeout {
		nextTimeout = a / b
	}
	if nextTimeout < minRedialWait {
		nextTimeout = minRedialWait
	}
	wait := time.Duration(float64(time.Second) * nextTimeout)
	if wait < waitThreshold {
		n.redialWaitStart = unixTime
		n.redialWaitEnd = unixTime + int64(nextTimeout)
		s.ns.SetFieldSub(node, sfiNodeHistory, n)
		s.ns.SetStateSub(node, sfRedialWait, nodestate.Flags{}, wait)
		s.updateWeight(node, totalValue, totalDialCost)
	} else {
		// discard known node statistics if waiting time is very long because the node
		// hasn't been responsive for a very long time
		s.ns.SetFieldSub(node, sfiNodeHistory, nil)
		s.ns.SetFieldSub(node, sfiNodeWeight, nil)
		s.ns.SetStateSub(node, nodestate.Flags{}, sfHasValue, 0)
	}
}

// calculateWeight calculates and sets the node weight without altering the node history.
// This function should be called during startup and shutdown only, otherwise setRedialWait
// will keep the weights updated as the underlying statistics are adjusted.
// Note: this function should run inside a NodeStateMachine operation
func (s *serverPool) calculateWeight(node *enode.Node) {
	n, _ := s.ns.GetField(node, sfiNodeHistory).(nodeHistory)
	_, totalValue := s.serviceValue(node)
	totalDialCost := s.addDialCost(&n, 0)
	s.updateWeight(node, totalValue, totalDialCost)
}