// Source: github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/alsp/manager/manager.go

package alspmgr

import (
	crand "crypto/rand"
	"errors"
	"fmt"
	"math"
	"time"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine/common/worker"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/irrecoverable"
	"github.com/onflow/flow-go/module/mempool/queue"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/alsp"
	"github.com/onflow/flow-go/network/alsp/internal"
	"github.com/onflow/flow-go/network/alsp/model"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/utils/logging"
)

const (
	// defaultMisbehaviorReportManagerWorkers is the default number of workers in the worker pool.
	defaultMisbehaviorReportManagerWorkers = 2
)

var (
	// ErrSpamRecordCacheSizeNotSet is returned when the spam record cache size is not set, it is a fatal irrecoverable error,
	// and the ALSP module cannot be initialized.
	ErrSpamRecordCacheSizeNotSet = errors.New("spam record cache size is not set")
	// ErrSpamReportQueueSizeNotSet is returned when the spam report queue size is not set, it is a fatal irrecoverable error,
	// and the ALSP module cannot be initialized.
	ErrSpamReportQueueSizeNotSet = errors.New("spam report queue size is not set")
	// ErrHeartBeatIntervalNotSet is returned when the heartbeat interval is not set, it is a fatal irrecoverable error,
	// and the ALSP module cannot be initialized.
	ErrHeartBeatIntervalNotSet = errors.New("heartbeat interval is not set")
)

// SpamRecordCacheFactory is a function that builds an alsp.SpamRecordCache from a logger, a cache size,
// and a HeroCache metrics collector.
type SpamRecordCacheFactory func(zerolog.Logger, uint32, module.HeroCacheMetrics) alsp.SpamRecordCache

// SpamRecordDecayFunc is the function that calculates the decay of the spam record.
47 type SpamRecordDecayFunc func(model.ProtocolSpamRecord) float64 48 49 func defaultSpamRecordDecayFunc() SpamRecordDecayFunc { 50 return func(record model.ProtocolSpamRecord) float64 { 51 return math.Min(record.Penalty+record.Decay, 0) 52 } 53 } 54 55 // defaultSpamRecordCacheFactory is the default spam record cache factory. It creates a new spam record cache with the given parameter. 56 func defaultSpamRecordCacheFactory() SpamRecordCacheFactory { 57 return func(logger zerolog.Logger, size uint32, cacheMetrics module.HeroCacheMetrics) alsp.SpamRecordCache { 58 return internal.NewSpamRecordCache( 59 size, 60 logger.With().Str("component", "spam_record_cache").Logger(), 61 cacheMetrics, 62 model.SpamRecordFactory()) 63 } 64 } 65 66 // MisbehaviorReportManager is responsible for handling misbehavior reports, i.e., penalizing the misbehaving node 67 // and report the node to be disallow-listed if the overall penalty of the misbehaving node drops below the disallow-listing threshold. 68 type MisbehaviorReportManager struct { 69 component.Component 70 logger zerolog.Logger 71 metrics module.AlspMetrics 72 // cacheFactory is the factory for creating the spam record cache. MisbehaviorReportManager is coming with a 73 // default factory that creates a new spam record cache with the given parameter. However, this factory can be 74 // overridden with a custom factory. 75 cacheFactory SpamRecordCacheFactory 76 // cache is the spam record cache that stores the spam records for the authorized nodes. It is initialized by 77 // invoking the cacheFactory. 78 cache alsp.SpamRecordCache 79 // disablePenalty indicates whether applying the penalty to the misbehaving node is disabled. 80 // When disabled, the ALSP module logs the misbehavior reports and updates the metrics, but does not apply the penalty. 81 // This is useful for managing production incidents. 82 // Note: under normal circumstances, the ALSP module should not be disabled. 
83 disablePenalty bool 84 85 // disallowListingConsumer is the consumer for the disallow-listing notifications. 86 // It is notified when a node is disallow-listed by this manager. 87 disallowListingConsumer network.DisallowListNotificationConsumer 88 89 // workerPool is the worker pool for handling the misbehavior reports in a thread-safe and non-blocking manner. 90 workerPool *worker.Pool[internal.ReportedMisbehaviorWork] 91 92 // decayFunc is the function that calculates the decay of the spam record. 93 decayFunc SpamRecordDecayFunc 94 } 95 96 var _ network.MisbehaviorReportManager = (*MisbehaviorReportManager)(nil) 97 98 type MisbehaviorReportManagerConfig struct { 99 Logger zerolog.Logger 100 // SpamRecordCacheSize is the size of the spam record cache that stores the spam records for the authorized nodes. 101 // It should be as big as the number of authorized nodes in Flow network. 102 // Recommendation: for small network sizes 10 * number of authorized nodes to ensure that the cache can hold all the spam records of the authorized nodes. 103 SpamRecordCacheSize uint32 104 // SpamReportQueueSize is the size of the queue that stores the spam records to be processed by the worker pool. 105 SpamReportQueueSize uint32 106 // AlspMetrics is the metrics instance for the alsp module (collecting spam reports). 107 AlspMetrics module.AlspMetrics 108 // HeroCacheMetricsFactory is the metrics factory for the HeroCache-related metrics. 109 // Having factory as part of the config allows to create the metrics locally in the module. 110 HeroCacheMetricsFactory metrics.HeroCacheMetricsFactory 111 // DisablePenalty indicates whether applying the penalty to the misbehaving node is disabled. 112 // When disabled, the ALSP module logs the misbehavior reports and updates the metrics, but does not apply the penalty. 113 // This is useful for managing production incidents. 114 // Note: under normal circumstances, the ALSP module should not be disabled. 
115 DisablePenalty bool 116 // NetworkType is the type of the network it is used to determine whether the ALSP module is utilized in the 117 // public (unstaked) or private (staked) network. 118 NetworkType network.NetworkingType 119 // HeartBeatInterval is the interval between the heartbeats. Heartbeat is a recurring event that is used to 120 // apply recurring actions, e.g., decay the penalty of the misbehaving nodes. 121 HeartBeatInterval time.Duration 122 Opts []MisbehaviorReportManagerOption 123 } 124 125 // validate validates the MisbehaviorReportManagerConfig instance. It returns an error if the config is invalid. 126 // It only validates the numeric fields of the config that may yield a stealth error in the production. 127 // It does not validate the struct fields of the config against a nil value. 128 // Args: 129 // 130 // None. 131 // 132 // Returns: 133 // 134 // An error if the config is invalid. 135 func (c MisbehaviorReportManagerConfig) validate() error { 136 if c.SpamRecordCacheSize == 0 { 137 return ErrSpamRecordCacheSizeNotSet 138 } 139 if c.SpamReportQueueSize == 0 { 140 return ErrSpamReportQueueSizeNotSet 141 } 142 if c.HeartBeatInterval == 0 { 143 return ErrHeartBeatIntervalNotSet 144 } 145 return nil 146 } 147 148 type MisbehaviorReportManagerOption func(*MisbehaviorReportManager) 149 150 // NewMisbehaviorReportManager creates a new instance of the MisbehaviorReportManager. 151 // Args: 152 // cfg: the configuration for the MisbehaviorReportManager. 153 // consumer: the consumer for the disallow-listing notifications. When the manager decides to disallow-list a node, it notifies the consumer to 154 // perform the lower-level disallow-listing action at the networking layer. 155 // All connections to the disallow-listed node are closed and the node is removed from the overlay, and 156 // no further connections are established to the disallow-listed node, either inbound or outbound. 
157 // 158 // Returns: 159 // 160 // A new instance of the MisbehaviorReportManager. 161 // An error if the config is invalid. The error is considered irrecoverable. 162 func NewMisbehaviorReportManager(cfg *MisbehaviorReportManagerConfig, consumer network.DisallowListNotificationConsumer) (*MisbehaviorReportManager, error) { 163 if err := cfg.validate(); err != nil { 164 return nil, fmt.Errorf("invalid configuration for MisbehaviorReportManager: %w", err) 165 } 166 167 lg := cfg.Logger.With().Str("module", "misbehavior_report_manager").Logger() 168 m := &MisbehaviorReportManager{ 169 logger: lg, 170 metrics: cfg.AlspMetrics, 171 disablePenalty: cfg.DisablePenalty, 172 disallowListingConsumer: consumer, 173 cacheFactory: defaultSpamRecordCacheFactory(), 174 decayFunc: defaultSpamRecordDecayFunc(), 175 } 176 177 store := queue.NewHeroStore( 178 cfg.SpamReportQueueSize, 179 lg.With().Str("component", "spam_record_queue").Logger(), 180 metrics.ApplicationLayerSpamRecordQueueMetricsFactory(cfg.HeroCacheMetricsFactory, cfg.NetworkType)) 181 182 m.workerPool = worker.NewWorkerPoolBuilder[internal.ReportedMisbehaviorWork]( 183 cfg.Logger, 184 store, 185 m.processMisbehaviorReport).Build() 186 187 for _, opt := range cfg.Opts { 188 opt(m) 189 } 190 191 m.cache = m.cacheFactory( 192 lg, 193 cfg.SpamRecordCacheSize, 194 metrics.ApplicationLayerSpamRecordCacheMetricFactory(cfg.HeroCacheMetricsFactory, cfg.NetworkType)) 195 196 builder := component.NewComponentManagerBuilder() 197 builder.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 198 ready() 199 m.heartbeatLoop(ctx, cfg.HeartBeatInterval) // blocking call 200 }) 201 for i := 0; i < defaultMisbehaviorReportManagerWorkers; i++ { 202 builder.AddWorker(m.workerPool.WorkerLogic()) 203 } 204 205 m.Component = builder.Build() 206 207 if m.disablePenalty { 208 m.logger.Warn().Msg("penalty mechanism of alsp is disabled") 209 } 210 return m, nil 211 } 212 213 // HandleMisbehaviorReport is called upon 
a new misbehavior is reported. 214 // The implementation of this function should be thread-safe and non-blocking. 215 // Args: 216 // 217 // channel: the channel on which the misbehavior is reported. 218 // report: the misbehavior report. 219 // 220 // Returns: 221 // 222 // none. 223 func (m *MisbehaviorReportManager) HandleMisbehaviorReport(channel channels.Channel, report network.MisbehaviorReport) { 224 lg := m.logger.With(). 225 Str("channel", channel.String()). 226 Hex("misbehaving_id", logging.ID(report.OriginId())). 227 Str("reason", report.Reason().String()). 228 Float64("penalty", report.Penalty()).Logger() 229 lg.Trace().Msg("received misbehavior report") 230 m.metrics.OnMisbehaviorReported(channel.String(), report.Reason().String()) 231 232 nonce := [internal.NonceSize]byte{} 233 nonceSize, err := crand.Read(nonce[:]) 234 if err != nil { 235 // this should never happen, but if it does, we should not continue 236 lg.Fatal().Err(err).Msg("failed to generate nonce") 237 return 238 } 239 if nonceSize != internal.NonceSize { 240 // this should never happen, but if it does, we should not continue 241 lg.Fatal().Msgf("nonce size mismatch: expected %d, got %d", internal.NonceSize, nonceSize) 242 return 243 } 244 245 if ok := m.workerPool.Submit(internal.ReportedMisbehaviorWork{ 246 Channel: channel, 247 OriginId: report.OriginId(), 248 Reason: report.Reason(), 249 Penalty: report.Penalty(), 250 Nonce: nonce, 251 }); !ok { 252 lg.Warn().Msg("discarding misbehavior report because either the queue is full or the misbehavior report is duplicate") 253 } 254 255 lg.Debug().Msg("misbehavior report submitted") 256 } 257 258 // heartbeatLoop starts the heartbeat ticks ticker to tick at the given intervals. It is a blocking function, and 259 // should be called in a separate goroutine. It returns when the context is canceled. Hearbeats are recurring events that 260 // are used to perform periodic tasks. 261 // Args: 262 // 263 // ctx: the context. 
264 // interval: the interval between two ticks. 265 // 266 // Returns: 267 // 268 // none. 269 func (m *MisbehaviorReportManager) heartbeatLoop(ctx irrecoverable.SignalerContext, interval time.Duration) { 270 ticker := time.NewTicker(interval) 271 m.logger.Info().Dur("interval", interval).Msg("starting heartbeat ticks") 272 defer ticker.Stop() 273 for { 274 select { 275 case <-ctx.Done(): 276 m.logger.Debug().Msg("heartbeat ticks stopped") 277 return 278 case <-ticker.C: 279 m.logger.Trace().Msg("new heartbeat ticked") 280 if err := m.onHeartbeat(); err != nil { 281 // any error returned from onHeartbeat is considered irrecoverable. 282 ctx.Throw(fmt.Errorf("failed to perform heartbeat: %w", err)) 283 } 284 } 285 } 286 } 287 288 // onHeartbeat is called upon a heartbeatLoop. It encapsulates the recurring tasks that should be performed 289 // during a heartbeat, which currently includes decay of the spam records. 290 // Args: 291 // 292 // none. 293 // 294 // Returns: 295 // 296 // error: if an error occurs, it is returned. No error is expected during normal operation. Any returned error must 297 // be considered as irrecoverable. 298 func (m *MisbehaviorReportManager) onHeartbeat() error { 299 allIds := m.cache.Identities() 300 301 for _, id := range allIds { 302 m.logger.Trace().Hex("identifier", logging.ID(id)).Msg("onHeartbeat - looping through spam records") 303 penalty, err := m.cache.AdjustWithInit(id, func(record model.ProtocolSpamRecord) (model.ProtocolSpamRecord, error) { 304 if record.Penalty > 0 { 305 // sanity check; this should never happen. 306 return record, fmt.Errorf("illegal state: spam record %x has positive penalty %f", id, record.Penalty) 307 } 308 if record.Decay <= 0 { 309 // sanity check; this should never happen. 310 return record, fmt.Errorf("illegal state: spam record %x has non-positive decay %f", id, record.Decay) 311 } 312 313 // TODO: this can be done in batch but at this stage let's send individual notifications. 
314 // (it requires enabling the batch mode end-to-end including the cache in network). 315 // as long as record.Penalty is NOT below model.DisallowListingThreshold, 316 // the node is considered allow-listed and can conduct inbound and outbound connections. 317 // Once it falls below model.DisallowListingThreshold, it needs to be disallow listed. 318 if record.Penalty < model.DisallowListingThreshold && !record.DisallowListed { 319 // cutoff counter keeps track of how many times the penalty has been below the threshold. 320 record.CutoffCounter++ 321 record.DisallowListed = true 322 // Adjusts decay dynamically based on how many times the node was disallow-listed (cutoff). 323 record.Decay = m.adjustDecayFunc(record.CutoffCounter) 324 m.logger.Warn(). 325 Str("key", logging.KeySuspicious). 326 Hex("identifier", logging.ID(id)). 327 Float64("penalty", record.Penalty). 328 Uint64("cutoff_counter", record.CutoffCounter). 329 Float64("decay_speed", record.Decay). 330 Bool("disallow_listed", record.DisallowListed). 331 Msg("node penalty dropped below threshold, initiating disallow listing") 332 m.disallowListingConsumer.OnDisallowListNotification(&network.DisallowListingUpdate{ 333 FlowIds: flow.IdentifierList{id}, 334 Cause: network.DisallowListedCauseAlsp, // sets the ALSP disallow listing cause on node 335 }) 336 } 337 // each time we decay the penalty by the decay speed, the penalty is a negative number, and the decay speed 338 // is a positive number. So the penalty is getting closer to zero. 339 // We use math.Min() to make sure the penalty is never positive. 340 m.logger.Trace(). 341 Hex("identifier", logging.ID(id)). 342 Uint64("cutoff_counter", record.CutoffCounter). 343 Bool("disallow_listed", record.DisallowListed). 344 Float64("penalty", record.Penalty). 345 Msg("heartbeat interval, pulled the spam record for decaying") 346 record.Penalty = m.decayFunc(record) 347 m.logger.Trace(). 348 Hex("identifier", logging.ID(id)). 
349 Uint64("cutoff_counter", record.CutoffCounter). 350 Bool("disallow_listed", record.DisallowListed). 351 Float64("penalty", record.Penalty). 352 Msg("heartbeat interval, spam record penalty adjusted by decay function") 353 354 // TODO: this can be done in batch but at this stage let's send individual notifications. 355 // (it requires enabling the batch mode end-to-end including the cache in network). 356 if record.Penalty == float64(0) && record.DisallowListed { 357 record.DisallowListed = false 358 359 m.logger.Info(). 360 Hex("identifier", logging.ID(id)). 361 Uint64("cutoff_counter", record.CutoffCounter). 362 Float64("decay_speed", record.Decay). 363 Bool("disallow_listed", record.DisallowListed). 364 Msg("allow-listing a node that was disallow listed") 365 // Penalty has fully decayed to zero and the node can be back in the allow list. 366 m.disallowListingConsumer.OnAllowListNotification(&network.AllowListingUpdate{ 367 FlowIds: flow.IdentifierList{id}, 368 Cause: network.DisallowListedCauseAlsp, // clears the ALSP disallow listing cause from node 369 }) 370 } 371 372 m.logger.Trace(). 373 Hex("identifier", logging.ID(id)). 374 Uint64("cutoff_counter", record.CutoffCounter). 375 Float64("decay_speed", record.Decay). 376 Bool("disallow_listed", record.DisallowListed). 377 Msg("spam record decayed successfully") 378 return record, nil 379 }) 380 381 // any error here is fatal because it indicates a bug in the cache. All ids being iterated over are in the cache, 382 // and adjust function above should not return an error unless there is a bug. 383 if err != nil { 384 return fmt.Errorf("failed to decay spam record %x: %w", id, err) 385 } 386 387 m.logger.Trace(). 388 Hex("identifier", logging.ID(id)). 389 Float64("updated_penalty", penalty). 390 Msg("spam record decayed") 391 } 392 393 return nil 394 } 395 396 // processMisbehaviorReport is the worker function that processes the misbehavior reports. 397 // It is called by the worker pool. 
398 // It applies the penalty to the misbehaving node and updates the spam record cache. 399 // Implementation must be thread-safe so that it can be called concurrently. 400 // Args: 401 // 402 // report: the misbehavior report to be processed. 403 // 404 // Returns: 405 // 406 // error: the error that occurred during the processing of the misbehavior report. The returned error is 407 // irrecoverable and the node should crash if it occurs (indicating a bug in the ALSP module). 408 func (m *MisbehaviorReportManager) processMisbehaviorReport(report internal.ReportedMisbehaviorWork) error { 409 lg := m.logger.With(). 410 Str("channel", report.Channel.String()). 411 Hex("misbehaving_id", logging.ID(report.OriginId)). 412 Str("reason", report.Reason.String()). 413 Float64("penalty", report.Penalty).Logger() 414 415 if m.disablePenalty { 416 // when penalty mechanism disabled, the misbehavior is logged and metrics are updated, 417 // but no further actions are taken. 418 lg.Trace().Msg("discarding misbehavior report because alsp penalty is disabled") 419 return nil 420 } 421 422 // Adjust will first try to apply the penalty to the spam record, if it does not exist, the Adjust method will initialize 423 // a spam record for the peer first and then applies the penalty. In other words, Adjust uses an optimistic update by 424 // first assuming that the spam record exists and then initializing it if it does not exist. In this way, we avoid 425 // acquiring the lock twice per misbehavior report, reducing the contention on the lock and improving the performance. 426 updatedPenalty, err := m.cache.AdjustWithInit(report.OriginId, func(record model.ProtocolSpamRecord) (model.ProtocolSpamRecord, error) { 427 if report.Penalty > 0 { 428 // this should never happen, unless there is a bug in the misbehavior report handling logic. 429 // we should crash the node in this case to prevent further misbehavior reports from being lost and fix the bug. 
430 // we return the error as it is considered as a fatal error. 431 return record, fmt.Errorf("penalty value is positive, expected negative %f", report.Penalty) 432 } 433 record.Penalty += report.Penalty // penalty value is negative. We add it to the current penalty. 434 lg = lg.With(). 435 Float64("penalty_before_update", record.Penalty). 436 Uint64("cutoff_counter", record.CutoffCounter). 437 Float64("decay_speed", record.Decay). 438 Bool("disallow_listed", record.DisallowListed). 439 Logger() 440 return record, nil 441 }) 442 if err != nil { 443 // this should never happen, unless there is a bug in the spam record cache implementation. 444 // we should crash the node in this case to prevent further misbehavior reports from being lost and fix the bug. 445 return fmt.Errorf("failed to apply penalty to the spam record: %w", err) 446 } 447 lg.Debug().Float64("updated_penalty", updatedPenalty).Msg("misbehavior report handled") 448 return nil 449 } 450 451 // adjustDecayFunc calculates the decay value of the spam record cache. This allows the decay to be different on subsequent disallow listings. 452 // It returns the decay speed for the given cutoff counter. 453 // The cutoff counter is the number of times that the node has been disallow-listed. 454 // Args: 455 // cutoffCounter: the number of times that the node has been disallow-listed including the current time. Note that the cutoff counter 456 // must always be updated before calling this function. 457 // 458 // Returns: 459 // 460 // float64: the decay speed for the given cutoff counter. 461 func (m *MisbehaviorReportManager) adjustDecayFunc(cutoffCounter uint64) float64 { 462 // decaySpeeds illustrates the decay speeds for different cutoff counters. 463 // The first cutoff does not reduce the decay speed (1000 -> 1000). However, the second, third, 464 // and forth cutoffs reduce the decay speed by 90% (1000 -> 100, 100 -> 10, 10 -> 1). 
465 // All subsequent cutoffs after the fourth cutoff use the last decay speed (1). 466 // This is to prevent the decay speed from becoming too small and the spam record from taking too long to decay. 467 switch { 468 case cutoffCounter == 1: 469 return 1000 470 case cutoffCounter == 2: 471 return 100 472 case cutoffCounter == 3: 473 return 10 474 case cutoffCounter >= 4: 475 return 1 476 default: 477 panic(fmt.Sprintf("illegal-state cutoff counter must be positive, it should include the current time: %d", cutoffCounter)) 478 } 479 } 480 481 // WithSpamRecordsCacheFactory sets the spam record cache factory for the MisbehaviorReportManager. 482 // Args: 483 // 484 // f: the spam record cache factory. 485 // 486 // Returns: 487 // 488 // a MisbehaviorReportManagerOption that sets the spam record cache for the MisbehaviorReportManager. 489 // 490 // Note: this option is useful primarily for testing purposes. The default factory should be sufficient for production. 491 func WithSpamRecordsCacheFactory(f SpamRecordCacheFactory) MisbehaviorReportManagerOption { 492 return func(m *MisbehaviorReportManager) { 493 m.cacheFactory = f 494 } 495 } 496 497 // WithDecayFunc sets the decay function for the MisbehaviorReportManager. Useful for testing purposes to simulate the decay of the penalty without waiting for the actual decay. 498 // Args: 499 // 500 // f: the decay function. 501 // 502 // Returns: 503 // 504 // a MisbehaviorReportManagerOption that sets the decay function for the MisbehaviorReportManager. 505 // 506 // Note: this option is useful primarily for testing purposes. The default decay function should be used for production. 507 func WithDecayFunc(f SpamRecordDecayFunc) MisbehaviorReportManagerOption { 508 return func(m *MisbehaviorReportManager) { 509 m.decayFunc = f 510 } 511 }