// Copyright 2020 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package client

import (
	"errors"
	"math/rand"
	"reflect"
	"sync"
	"sync/atomic"
	"time"

	"github.com/juliankolbe/go-ethereum/common/mclock"
	"github.com/juliankolbe/go-ethereum/ethdb"
	"github.com/juliankolbe/go-ethereum/les/utils"
	"github.com/juliankolbe/go-ethereum/log"
	"github.com/juliankolbe/go-ethereum/metrics"
	"github.com/juliankolbe/go-ethereum/p2p/enode"
	"github.com/juliankolbe/go-ethereum/p2p/enr"
	"github.com/juliankolbe/go-ethereum/p2p/nodestate"
	"github.com/juliankolbe/go-ethereum/rlp"
)

const (
	minTimeout          = time.Millisecond * 500 // minimum request timeout suggested by the server pool
	timeoutRefresh      = time.Second * 5        // recalculate timeout if older than this
	dialCost            = 10000                  // cost of a TCP dial (used for known node selection weight calculation)
	dialWaitStep        = 1.5                    // exponential multiplier of redial wait time when no value was provided by the server
	queryCost           = 500                    // cost of a UDP pre-negotiation query
	queryWaitStep       = 1.02                   // exponential multiplier of redial wait time when no value was provided by the server
	waitThreshold       = time.Hour * 2000       // drop node if waiting time is over the threshold
	nodeWeightMul       = 1000000                // multiplier constant for node weight calculation
	nodeWeightThreshold = 100                    // minimum weight for keeping a node in the known (valuable) set
	minRedialWait       = 10                     // minimum redial wait time in seconds
	preNegLimit         = 5                      // maximum number of simultaneous pre-negotiation queries
	maxQueryFails       = 100                    // number of consecutive UDP query failures before we print a warning
)

// ServerPool provides a node iterator for dial candidates. The output is a mix of newly discovered
// nodes, a weighted random selection of known (previously valuable) nodes and trusted/paid nodes.
type ServerPool struct {
	clock    mclock.Clock
	unixTime func() int64 // returns the current unix time in seconds; replaceable for testing
	db       ethdb.KeyValueStore

	ns                  *nodestate.NodeStateMachine
	vt                  *ValueTracker
	mixer               *enode.FairMix
	mixSources          []enode.Iterator
	dialIterator        enode.Iterator
	validSchemes        enr.IdentityScheme
	trustedURLs         []string
	fillSet             *FillSet
	started, queryFails uint32 // accessed atomically

	// timeoutLock guards timeout, timeWeights and timeoutRefreshed below.
	timeoutLock      sync.RWMutex
	timeout          time.Duration
	timeWeights      ResponseTimeWeights
	timeoutRefreshed mclock.AbsTime

	suggestedTimeoutGauge, totalValueGauge metrics.Gauge
	sessionValueMeter                      metrics.Meter
}

// nodeHistory keeps track of dial costs which determine node weight together with the
// service value calculated by ValueTracker.
type nodeHistory struct {
	dialCost                       utils.ExpiredValue
	redialWaitStart, redialWaitEnd int64 // unix time (seconds)
}

// nodeHistoryEnc is the RLP-encodable form of nodeHistory; the unix timestamps
// are stored as uint64 for encoding purposes.
type nodeHistoryEnc struct {
	DialCost                       utils.ExpiredValue
	RedialWaitStart, RedialWaitEnd uint64
}

// queryFunc sends a pre-negotiation query and blocks until a response arrives or timeout occurs.
// It returns 1 if the remote node has confirmed that connection is possible, 0 if not
// possible and -1 if no response arrived (timeout).
type queryFunc func(*enode.Node) int

var (
	clientSetup       = &nodestate.Setup{Version: 1}
	sfHasValue        = clientSetup.NewPersistentFlag("hasValue")
	sfQueried         = clientSetup.NewFlag("queried")
	sfCanDial         = clientSetup.NewFlag("canDial")
	sfDialing         = clientSetup.NewFlag("dialed")
	sfWaitDialTimeout = clientSetup.NewFlag("dialTimeout")
	sfConnected       = clientSetup.NewFlag("connected")
	sfRedialWait      = clientSetup.NewFlag("redialWait")
	sfAlwaysConnect   = clientSetup.NewFlag("alwaysConnect")
	// sfDisableSelection marks nodes that should not be selected as dial
	// candidates because they are already being processed in some way.
	sfDisableSelection = nodestate.MergeFlags(sfQueried, sfCanDial, sfDialing, sfConnected, sfRedialWait)

	// sfiNodeHistory persists the per-node dial cost and redial wait window
	// (see nodeHistory); encoded/decoded through nodeHistoryEnc via RLP.
	sfiNodeHistory = clientSetup.NewPersistentField("nodeHistory", reflect.TypeOf(nodeHistory{}),
		func(field interface{}) ([]byte, error) {
			if n, ok := field.(nodeHistory); ok {
				ne := nodeHistoryEnc{
					DialCost:        n.dialCost,
					RedialWaitStart: uint64(n.redialWaitStart),
					RedialWaitEnd:   uint64(n.redialWaitEnd),
				}
				enc, err := rlp.EncodeToBytes(&ne)
				return enc, err
			}
			return nil, errors.New("invalid field type")
		},
		func(enc []byte) (interface{}, error) {
			var ne nodeHistoryEnc
			err := rlp.DecodeBytes(enc, &ne)
			n := nodeHistory{
				dialCost:        ne.DialCost,
				redialWaitStart: int64(ne.RedialWaitStart),
				redialWaitEnd:   int64(ne.RedialWaitEnd),
			}
			return n, err
		},
	)
	sfiNodeWeight     = clientSetup.NewField("nodeWeight", reflect.TypeOf(uint64(0)))
	sfiConnectedStats = clientSetup.NewField("connectedStats", reflect.TypeOf(ResponseTimeStats{}))
)

// NewServerPool creates a new server pool and returns it together with the dial
// candidate iterator. If query is non-nil a UDP pre-negotiation filter is
// installed in front of the dial iterator.
func NewServerPool(db ethdb.KeyValueStore, dbKey []byte, mixTimeout time.Duration, query queryFunc, clock mclock.Clock, trustedURLs []string, requestList []RequestInfo) (*ServerPool, enode.Iterator) {
	s := &ServerPool{
		db:           db,
		clock:        clock,
		unixTime:     func() int64 { return time.Now().Unix() },
		validSchemes: enode.ValidSchemes,
		trustedURLs:  trustedURLs,
		vt:           NewValueTracker(db, &mclock.System{}, requestList, time.Minute, 1/float64(time.Hour), 1/float64(time.Hour*100), 1/float64(time.Hour*1000)),
		ns:           nodestate.NewNodeStateMachine(db, []byte(string(dbKey)+"ns:"), clock, clientSetup),
	}
	s.recalTimeout()
	s.mixer = enode.NewFairMix(mixTimeout)
	// known (previously valuable) nodes are selected by weighted random selection;
	// trusted nodes are always dial candidates
	knownSelector := NewWrsIterator(s.ns, sfHasValue, sfDisableSelection, sfiNodeWeight)
	alwaysConnect := NewQueueIterator(s.ns, sfAlwaysConnect, sfDisableSelection, true, nil)
	s.mixSources = append(s.mixSources, knownSelector)
	s.mixSources = append(s.mixSources, alwaysConnect)

	iter := enode.Iterator(s.mixer)
	if query != nil {
		iter = s.addPreNegFilter(iter, query)
	}
	// every node returned by the iterator is marked as dialing and given a
	// dial timeout window
	s.dialIterator = enode.Filter(iter, func(node *enode.Node) bool {
		s.ns.SetState(node, sfDialing, sfCanDial, 0)
		s.ns.SetState(node, sfWaitDialTimeout, nodestate.Flags{}, time.Second*10)
		return true
	})

	s.ns.SubscribeState(nodestate.MergeFlags(sfWaitDialTimeout, sfConnected), func(n *enode.Node, oldState, newState nodestate.Flags) {
		if oldState.Equals(sfWaitDialTimeout) && newState.IsEmpty() {
			// dial timeout, no connection
			s.setRedialWait(n, dialCost, dialWaitStep)
			s.ns.SetStateSub(n, nodestate.Flags{}, sfDialing, 0)
		}
	})

	return s, s.dialIterator
}

// AddMetrics adds metrics to the server pool. Should be called before Start().
176 func (s *ServerPool) AddMetrics( 177 suggestedTimeoutGauge, totalValueGauge, serverSelectableGauge, serverConnectedGauge metrics.Gauge, 178 sessionValueMeter, serverDialedMeter metrics.Meter) { 179 180 s.suggestedTimeoutGauge = suggestedTimeoutGauge 181 s.totalValueGauge = totalValueGauge 182 s.sessionValueMeter = sessionValueMeter 183 if serverSelectableGauge != nil { 184 s.ns.AddLogMetrics(sfHasValue, sfDisableSelection, "selectable", nil, nil, serverSelectableGauge) 185 } 186 if serverDialedMeter != nil { 187 s.ns.AddLogMetrics(sfDialing, nodestate.Flags{}, "dialed", serverDialedMeter, nil, nil) 188 } 189 if serverConnectedGauge != nil { 190 s.ns.AddLogMetrics(sfConnected, nodestate.Flags{}, "connected", nil, nil, serverConnectedGauge) 191 } 192 } 193 194 // AddSource adds a node discovery source to the server pool (should be called before start) 195 func (s *ServerPool) AddSource(source enode.Iterator) { 196 if source != nil { 197 s.mixSources = append(s.mixSources, source) 198 } 199 } 200 201 // addPreNegFilter installs a node filter mechanism that performs a pre-negotiation query. 202 // Nodes that are filtered out and does not appear on the output iterator are put back 203 // into redialWait state. 
// addPreNegFilter installs the pre-negotiation query stage between the input
// iterator and the canDial queue. Returns the filtered output iterator.
func (s *ServerPool) addPreNegFilter(input enode.Iterator, query queryFunc) enode.Iterator {
	s.fillSet = NewFillSet(s.ns, input, sfQueried)
	s.ns.SubscribeState(sfQueried, func(n *enode.Node, oldState, newState nodestate.Flags) {
		if newState.Equals(sfQueried) {
			fails := atomic.LoadUint32(&s.queryFails)
			if fails == maxQueryFails {
				// warn exactly once when the consecutive failure counter hits the limit
				log.Warn("UDP pre-negotiation query does not seem to work")
			}
			if fails > maxQueryFails {
				// cap so the skip probability below never exceeds 50%
				fails = maxQueryFails
			}
			if rand.Intn(maxQueryFails*2) < int(fails) {
				// skip pre-negotiation with increasing chance, max 50%
				// this ensures that the client can operate even if UDP is not working at all
				s.ns.SetStateSub(n, sfCanDial, nodestate.Flags{}, time.Second*10)
				// set canDial before resetting queried so that FillSet will not read more
				// candidates unnecessarily
				s.ns.SetStateSub(n, nodestate.Flags{}, sfQueried, 0)
				return
			}
			// run the blocking query in a separate goroutine; the state callback
			// must not block the state machine
			go func() {
				q := query(n)
				if q == -1 {
					// timeout: count as a consecutive failure
					atomic.AddUint32(&s.queryFails, 1)
				} else {
					// any response (positive or negative) resets the failure counter
					atomic.StoreUint32(&s.queryFails, 0)
				}
				s.ns.Operation(func() {
					// we are no longer running in the operation that the callback belongs to, start a new one because of setRedialWait
					if q == 1 {
						s.ns.SetStateSub(n, sfCanDial, nodestate.Flags{}, time.Second*10)
					} else {
						s.setRedialWait(n, queryCost, queryWaitStep)
					}
					s.ns.SetStateSub(n, nodestate.Flags{}, sfQueried, 0)
				})
			}()
		}
	})
	// the output queue drives the fill set target: only keep up to preNegLimit
	// queries in flight while a consumer is waiting for candidates
	return NewQueueIterator(s.ns, sfCanDial, nodestate.Flags{}, false, func(waiting bool) {
		if waiting {
			s.fillSet.SetTarget(preNegLimit)
		} else {
			s.fillSet.SetTarget(0)
		}
	})
}

// Start starts the server pool. Note that NodeStateMachine should be started first.
func (s *ServerPool) Start() {
	s.ns.Start()
	for _, iter := range s.mixSources {
		// add sources to mixer at startup because the mixer instantly tries to read them
		// which should only happen after NodeStateMachine has been started
		s.mixer.AddSource(iter)
	}
	for _, url := range s.trustedURLs {
		if node, err := enode.Parse(s.validSchemes, url); err == nil {
			s.ns.SetState(node, sfAlwaysConnect, nodestate.Flags{}, 0)
		} else {
			log.Error("Invalid trusted server URL", "url", url, "error", err)
		}
	}
	unixTime := s.unixTime()
	s.ns.Operation(func() {
		// restore weights and pending redial wait timeouts of known nodes
		s.ns.ForEach(sfHasValue, nodestate.Flags{}, func(node *enode.Node, state nodestate.Flags) {
			s.calculateWeight(node)
			if n, ok := s.ns.GetField(node, sfiNodeHistory).(nodeHistory); ok && n.redialWaitEnd > unixTime {
				wait := n.redialWaitEnd - unixTime
				lastWait := n.redialWaitEnd - n.redialWaitStart
				if wait > lastWait {
					// if the time until expiration is larger than the last suggested
					// waiting time then the system clock was probably adjusted
					wait = lastWait
				}
				s.ns.SetStateSub(node, sfRedialWait, nodestate.Flags{}, time.Duration(wait)*time.Second)
			}
		})
	})
	atomic.StoreUint32(&s.started, 1)
}

// Stop stops the server pool
func (s *ServerPool) Stop() {
	s.dialIterator.Close()
	if s.fillSet != nil {
		s.fillSet.Close()
	}
	s.ns.Operation(func() {
		s.ns.ForEach(sfConnected, nodestate.Flags{}, func(n *enode.Node, state nodestate.Flags) {
			// recalculate weight of connected nodes in order to update hasValue flag if necessary
			s.calculateWeight(n)
		})
	})
	s.ns.Stop()
	s.vt.Stop()
}

// RegisterNode implements serverPeerSubscriber
func (s *ServerPool) RegisterNode(node *enode.Node) (*NodeValueTracker, error) {
	if atomic.LoadUint32(&s.started) == 0 {
		return nil, errors.New("server pool not started yet")
	}
	s.ns.SetState(node, sfConnected, sfDialing.Or(sfWaitDialTimeout), 0)
	nvt := s.vt.Register(node.ID())
	// remember the stats snapshot at connection time so that session value
	// can be calculated later (see serviceValue)
	s.ns.SetField(node, sfiConnectedStats, nvt.RtStats())
	return nvt, nil
}

// UnregisterNode implements serverPeerSubscriber
func (s *ServerPool) UnregisterNode(node *enode.Node) {
	s.ns.Operation(func() {
		s.setRedialWait(node, dialCost, dialWaitStep)
		s.ns.SetStateSub(node, nodestate.Flags{}, sfConnected, 0)
		s.ns.SetFieldSub(node, sfiConnectedStats, nil)
	})
	s.vt.Unregister(node.ID())
}

// recalTimeout calculates the current recommended timeout. This value is used by
// the client as a "soft timeout" value. It also affects the service value calculation
// of individual nodes.
func (s *ServerPool) recalTimeout() {
	// Use cached result if possible, avoid recalculating too frequently.
	s.timeoutLock.RLock()
	refreshed := s.timeoutRefreshed
	s.timeoutLock.RUnlock()
	now := s.clock.Now()
	if refreshed != 0 && time.Duration(now-refreshed) < timeoutRefresh {
		return
	}
	// Cached result is stale, recalculate a new one.
	rts := s.vt.RtStats()

	// Add a fake statistic here. It is an easy way to initialize with some
	// conservative values when the database is new. As soon as we have a
	// considerable amount of real stats this small value won't matter.
	rts.Add(time.Second*2, 10, s.vt.StatsExpFactor())

	// Use either 10% failure rate timeout or twice the median response time
	// as the recommended timeout.
	timeout := minTimeout
	if t := rts.Timeout(0.1); t > timeout {
		timeout = t
	}
	if t := rts.Timeout(0.5) * 2; t > timeout {
		timeout = t
	}
	s.timeoutLock.Lock()
	if s.timeout != timeout {
		s.timeout = timeout
		s.timeWeights = TimeoutWeights(s.timeout)

		// report the new values if metrics have been installed
		if s.suggestedTimeoutGauge != nil {
			s.suggestedTimeoutGauge.Update(int64(s.timeout / time.Millisecond))
		}
		if s.totalValueGauge != nil {
			s.totalValueGauge.Update(int64(rts.Value(s.timeWeights, s.vt.StatsExpFactor())))
		}
	}
	s.timeoutRefreshed = now
	s.timeoutLock.Unlock()
}

// GetTimeout returns the recommended request timeout.
func (s *ServerPool) GetTimeout() time.Duration {
	s.recalTimeout()
	s.timeoutLock.RLock()
	defer s.timeoutLock.RUnlock()
	return s.timeout
}

// getTimeoutAndWeight returns the recommended request timeout as well as the
// response time weight which is necessary to calculate service value.
func (s *ServerPool) getTimeoutAndWeight() (time.Duration, ResponseTimeWeights) {
	s.recalTimeout()
	s.timeoutLock.RLock()
	defer s.timeoutLock.RUnlock()
	return s.timeout, s.timeWeights
}

// addDialCost adds the given amount of dial cost to the node history and returns the current
// amount of total dial cost
func (s *ServerPool) addDialCost(n *nodeHistory, amount int64) uint64 {
	logOffset := s.vt.StatsExpirer().LogOffset(s.clock.Now())
	if amount > 0 {
		n.dialCost.Add(amount, logOffset)
	}
	totalDialCost := n.dialCost.Value(logOffset)
	if totalDialCost < dialCost {
		// enforce a lower bound of one dial so the weight formula never
		// divides by a near-zero cost
		totalDialCost = dialCost
	}
	return totalDialCost
}

// serviceValue returns the service value accumulated in this session and in total
func (s *ServerPool) serviceValue(node *enode.Node) (sessionValue, totalValue float64) {
	nvt := s.vt.GetNode(node.ID())
	if nvt == nil {
		return 0, 0
	}
	currentStats := nvt.RtStats()
	_, timeWeights := s.getTimeoutAndWeight()
	expFactor := s.vt.StatsExpFactor()

	totalValue = currentStats.Value(timeWeights, expFactor)
	if connStats, ok := s.ns.GetField(node, sfiConnectedStats).(ResponseTimeStats); ok {
		// session value is the difference between the current stats and the
		// snapshot taken at connection time (see RegisterNode)
		diff := currentStats
		diff.SubStats(&connStats)
		sessionValue = diff.Value(timeWeights, expFactor)
		if s.sessionValueMeter != nil {
			s.sessionValueMeter.Mark(int64(sessionValue))
		}
	}
	return
}

// updateWeight calculates the node weight and updates the nodeWeight field and the
// hasValue flag. It also saves the node state if necessary.
// Note: this function should run inside a NodeStateMachine operation
func (s *ServerPool) updateWeight(node *enode.Node, totalValue float64, totalDialCost uint64) {
	weight := uint64(totalValue * nodeWeightMul / float64(totalDialCost))
	if weight >= nodeWeightThreshold {
		s.ns.SetStateSub(node, sfHasValue, nodestate.Flags{}, 0)
		s.ns.SetFieldSub(node, sfiNodeWeight, weight)
	} else {
		// below threshold: forget the node's weight and history
		s.ns.SetStateSub(node, nodestate.Flags{}, sfHasValue, 0)
		s.ns.SetFieldSub(node, sfiNodeWeight, nil)
		s.ns.SetFieldSub(node, sfiNodeHistory, nil)
	}
	s.ns.Persist(node) // saved if node history or hasValue changed
}

// setRedialWait calculates and sets the redialWait timeout based on the service value
// and dial cost accumulated during the last session/attempt and in total.
// The waiting time is raised exponentially if no service value has been received in order
// to prevent dialing an unresponsive node frequently for a very long time just because it
// was useful in the past. It can still be occasionally dialed though and once it provides
// a significant amount of service value again its waiting time is quickly reduced or reset
// to the minimum.
// Note: node weight is also recalculated and updated by this function.
// Note 2: this function should run inside a NodeStateMachine operation
func (s *ServerPool) setRedialWait(node *enode.Node, addDialCost int64, waitStep float64) {
	n, _ := s.ns.GetField(node, sfiNodeHistory).(nodeHistory)
	sessionValue, totalValue := s.serviceValue(node)
	totalDialCost := s.addDialCost(&n, addDialCost)

	// if the current dial session has yielded at least the average value/dial cost ratio
	// then the waiting time should be reset to the minimum. If the session value
	// is below average but still positive then timeout is limited to the ratio of
	// average / current service value multiplied by the minimum timeout. If the attempt
	// was unsuccessful then timeout is raised exponentially without limitation.
	// Note: dialCost is used in the formula below even if dial was not attempted at all
	// because the pre-negotiation query did not return a positive result. In this case
	// the ratio has no meaning anyway and waitFactor is always raised, though in smaller
	// steps because queries are cheaper and therefore we can allow more failed attempts.
	unixTime := s.unixTime()
	plannedTimeout := float64(n.redialWaitEnd - n.redialWaitStart) // last planned redialWait timeout
	var actualWait float64                                         // actual waiting time elapsed
	if unixTime > n.redialWaitEnd {
		// the planned timeout has elapsed
		actualWait = plannedTimeout
	} else {
		// if the node was redialed earlier then we do not raise the planned timeout
		// exponentially because that could lead to the timeout rising very high in
		// a short amount of time
		// Note that in case of an early redial actualWait also includes the dial
		// timeout or connection time of the last attempt but it still serves its
		// purpose of preventing the timeout rising quicker than linearly as a function
		// of total time elapsed without a successful connection.
		actualWait = float64(unixTime - n.redialWaitStart)
	}
	// raise timeout exponentially if the last planned timeout has elapsed
	// (use at least the last planned timeout otherwise)
	nextTimeout := actualWait * waitStep
	if plannedTimeout > nextTimeout {
		nextTimeout = plannedTimeout
	}
	// we reduce the waiting time if the server has provided service value during the
	// connection (but never under the minimum)
	// Note: when sessionValue is zero, b is zero and the condition below is never
	// true, so the division is safe.
	a := totalValue * dialCost * float64(minRedialWait)
	b := float64(totalDialCost) * sessionValue
	if a < b*nextTimeout {
		nextTimeout = a / b
	}
	if nextTimeout < minRedialWait {
		nextTimeout = minRedialWait
	}
	wait := time.Duration(float64(time.Second) * nextTimeout)
	if wait < waitThreshold {
		n.redialWaitStart = unixTime
		n.redialWaitEnd = unixTime + int64(nextTimeout)
		s.ns.SetFieldSub(node, sfiNodeHistory, n)
		s.ns.SetStateSub(node, sfRedialWait, nodestate.Flags{}, wait)
		s.updateWeight(node, totalValue, totalDialCost)
	} else {
		// discard known node statistics if waiting time is very long because the node
		// hasn't been responsive for a very long time
		s.ns.SetFieldSub(node, sfiNodeHistory, nil)
		s.ns.SetFieldSub(node, sfiNodeWeight, nil)
		s.ns.SetStateSub(node, nodestate.Flags{}, sfHasValue, 0)
	}
}

// calculateWeight calculates and sets the node weight without altering the node history.
// This function should be called during startup and shutdown only, otherwise setRedialWait
// will keep the weights updated as the underlying statistics are adjusted.
// Note: this function should run inside a NodeStateMachine operation
511 // Note: this function should run inside a NodeStateMachine operation 512 func (s *ServerPool) calculateWeight(node *enode.Node) { 513 n, _ := s.ns.GetField(node, sfiNodeHistory).(nodeHistory) 514 _, totalValue := s.serviceValue(node) 515 totalDialCost := s.addDialCost(&n, 0) 516 s.updateWeight(node, totalValue, totalDialCost) 517 } 518 519 // API returns the vflux client API 520 func (s *ServerPool) API() *PrivateClientAPI { 521 return NewPrivateClientAPI(s.vt) 522 }