github.com/asynkron/protoactor-go@v0.0.0-20240308120642-ef91a6abee75/cluster/gossiper.go (about) 1 // Copyright (C) 2015-2022 Asynkron AB All rights reserved 2 3 package cluster 4 5 import ( 6 "errors" 7 "fmt" 8 "log/slog" 9 "strings" 10 "time" 11 12 "github.com/asynkron/protoactor-go/remote" 13 14 "github.com/asynkron/gofun/set" 15 "google.golang.org/protobuf/proto" 16 17 "github.com/asynkron/protoactor-go/actor" 18 "google.golang.org/protobuf/types/known/anypb" 19 ) 20 21 const DefaultGossipActorName string = "gossip" 22 23 // GossipUpdate Used to update gossip data when a ClusterTopology event occurs 24 type GossipUpdate struct { 25 MemberID, Key string 26 Value *anypb.Any 27 SeqNumber int64 28 } 29 30 // ConsensusChecker Customary type used to provide consensus check callbacks of any type 31 // note: this is equivalent to (for future go v1.18): 32 // 33 // type ConsensusChecker[T] func(GossipState, map[string]empty) (bool, T) 34 type ConsensusChecker func(*GossipState, map[string]empty) (bool, interface{}) 35 36 // The Gossiper data structure manages Gossip 37 type Gossiper struct { 38 // The Gossiper Actor Name, defaults to "gossip" 39 GossipActorName string 40 41 // The Gossiper Cluster 42 cluster *Cluster 43 44 // The actor PID 45 pid *actor.PID 46 47 // Channel use to stop the gossip loop 48 close chan struct{} 49 50 // Message throttler 51 throttler actor.ShouldThrottle 52 } 53 54 // Creates a new Gossiper value and return it back 55 func newGossiper(cl *Cluster, opts ...Option) (*Gossiper, error) { 56 // create a new Gossiper value 57 gossiper := &Gossiper{ 58 GossipActorName: DefaultGossipActorName, 59 cluster: cl, 60 close: make(chan struct{}), 61 } 62 63 // apply any given options 64 for _, opt := range opts { 65 opt(gossiper) 66 } 67 68 return gossiper, nil 69 } 70 71 func (g *Gossiper) GetState(key string) (map[string]*GossipKeyValue, error) { 72 if g.throttler() == actor.Open { 73 g.cluster.Logger().Debug(fmt.Sprintf("Gossiper getting state from %s", g.pid)) 74 } 75 76 msg := NewGetGossipStateRequest(key) 77 timeout := g.cluster.Config.TimeoutTime 78 r, err := g.cluster.ActorSystem.Root.RequestFuture(g.pid, &msg, timeout).Result() 79 if err != nil { 80 switch err { 81 case actor.ErrTimeout: 82 g.cluster.Logger().Error("Could not get a response from GossipActor: request timeout", slog.Any("error", err), slog.String("remote", g.pid.String())) 83 return nil, err 84 case actor.ErrDeadLetter: 85 g.cluster.Logger().Error("remote no longer exists", slog.Any("error", err), slog.String("remote", g.pid.String())) 86 return nil, err 87 default: 88 g.cluster.Logger().Error("Could not get a response from GossipActor", slog.Any("error", err), slog.String("remote", g.pid.String())) 89 return nil, err 90 } 91 } 92 93 // try to cast the response to GetGossipStateResponse concrete value 94 response, ok := r.(*GetGossipStateResponse) 95 if !ok { 96 err := fmt.Errorf("could not promote %T interface to GetGossipStateResponse", r) 97 g.cluster.Logger().Error("Could not get a response from GossipActor", slog.Any("error", err), slog.String("remote", g.pid.String())) 98 return nil, err 99 } 100 101 return response.State, nil 102 } 103 104 // SetState Sends fire and forget message to update member state 105 func (g *Gossiper) SetState(key string, value proto.Message) { 106 if g.throttler() == actor.Open { 107 g.cluster.Logger().Debug(fmt.Sprintf("Gossiper setting state %s to %s", key, g.pid)) 108 } 109 110 if g.pid == nil { 111 return 112 } 113 114 msg := NewGossipStateKey(key, value) 115 g.cluster.ActorSystem.Root.Send(g.pid, &msg) 116 } 117 118 // SetStateRequest Sends a Request (that blocks) to update member state 119 func (g *Gossiper) SetStateRequest(key string, value proto.Message) error { 120 if g.throttler() == actor.Open { 121 g.cluster.Logger().Debug(fmt.Sprintf("Gossiper setting state %s to %s", key, g.pid)) 122 } 123 124 if g.pid == nil { 125 return errors.New("gossiper Actor PID is nil") 126 } 127 128 msg := NewGossipStateKey(key, value) 129 r, err := g.cluster.ActorSystem.Root.RequestFuture(g.pid, &msg, g.cluster.Config.TimeoutTime).Result() 130 if err != nil { 131 if err == actor.ErrTimeout { 132 g.cluster.Logger().Error("Could not get a response from Gossiper Actor: request timeout", slog.String("remote", g.pid.String())) 133 return err 134 } 135 g.cluster.Logger().Error("Could not get a response from Gossiper Actor", slog.Any("error", err), slog.String("remote", g.pid.String())) 136 return err 137 } 138 139 // try to cast the response to SetGossipStateResponse concrete value 140 _, ok := r.(*SetGossipStateResponse) 141 if !ok { 142 err := fmt.Errorf("could not promote %T interface to SetGossipStateResponse", r) 143 g.cluster.Logger().Error("Could not get a response from Gossip Actor", slog.Any("error", err), slog.String("remote", g.pid.String())) 144 return err 145 } 146 return nil 147 } 148 149 func (g *Gossiper) SendState() { 150 if g.pid == nil { 151 return 152 } 153 154 r, err := g.cluster.ActorSystem.Root.RequestFuture(g.pid, &SendGossipStateRequest{}, 5*time.Second).Result() 155 if err != nil { 156 g.cluster.Logger().Warn("Gossip could not send gossip request", slog.Any("PID", g.pid), slog.Any("error", err)) 157 return 158 } 159 160 if _, ok := r.(*SendGossipStateResponse); !ok { 161 g.cluster.Logger().Error("Gossip SendState received unknown response", slog.Any("message", r)) 162 } 163 } 164 165 // RegisterConsensusCheck Builds a consensus handler and a consensus checker, send the checker to the 166 // Gossip actor and returns the handler back to the caller 167 func (g *Gossiper) RegisterConsensusCheck(key string, getValue func(*anypb.Any) interface{}) ConsensusHandler { 168 definition := NewConsensusCheckBuilder(g.cluster.Logger(), key, getValue) 169 consensusHandle, check := definition.Build() 170 request := NewAddConsensusCheck(consensusHandle.GetID(), check) 171 g.cluster.ActorSystem.Root.Send(g.pid, &request) 172 return consensusHandle 173 } 174 175 func (g *Gossiper) StartGossiping() error { 176 var err error 177 g.pid, err = g.cluster.ActorSystem.Root.SpawnNamed(actor.PropsFromProducerWithActorSystem(func(system *actor.ActorSystem) actor.Actor { 178 return NewGossipActor( 179 g.cluster.Config.GossipRequestTimeout, 180 g.cluster.ActorSystem.ID, 181 func() set.Set[string] { 182 return g.cluster.GetBlockedMembers() 183 }, 184 g.cluster.Config.GossipFanOut, 185 g.cluster.Config.GossipMaxSend, 186 system, 187 ) 188 }), g.GossipActorName) 189 190 if err != nil { 191 g.cluster.Logger().Error("Failed to start gossip actor", slog.Any("error", err)) 192 return err 193 } 194 195 g.cluster.ActorSystem.EventStream.Subscribe(func(evt interface{}) { 196 if topology, ok := evt.(*ClusterTopology); ok { 197 g.cluster.ActorSystem.Root.Send(g.pid, topology) 198 } 199 }) 200 g.cluster.Logger().Info("Started Cluster Gossip") 201 g.throttler = actor.NewThrottle(3, 60*time.Second, g.throttledLog) 202 go g.gossipLoop() 203 204 return nil 205 } 206 207 func (g *Gossiper) Shutdown() { 208 if g.pid == nil { 209 return 210 } 211 212 g.cluster.Logger().Info("Shutting down gossip") 213 214 close(g.close) 215 216 err := g.cluster.ActorSystem.Root.StopFuture(g.pid).Wait() 217 if err != nil { 218 g.cluster.Logger().Error("failed to stop gossip actor", slog.Any("error", err)) 219 } 220 221 g.cluster.Logger().Info("Shut down gossip") 222 } 223 224 func (g *Gossiper) gossipLoop() { 225 g.cluster.Logger().Info("Starting gossip loop") 226 227 // create a ticker that will tick each GossipInterval milliseconds 228 // we do not use sleep as sleep puts the goroutine out of the scheduler 229 // P, and we do not want our Gs to be scheduled out from the running Ms 230 ticker := time.NewTicker(g.cluster.Config.GossipInterval) 231 breakLoop: 232 for !g.cluster.ActorSystem.IsStopped() { 233 select { 234 case <-g.close: 235 g.cluster.Logger().Info("Stopping Gossip Loop") 236 break breakLoop 237 case <-ticker.C: 238 239 g.blockExpiredHeartbeats() 240 g.blockGracefullyLeft() 241 242 g.SetState(HearthbeatKey, &MemberHeartbeat{ 243 // todo collect the actor statistics 244 ActorStatistics: &ActorStatistics{}, 245 }) 246 g.SendState() 247 } 248 } 249 } 250 251 // blockExpiredHeartbeats blocks members that have not sent a heartbeat for a long time 252 func (g *Gossiper) blockExpiredHeartbeats() { 253 if g.cluster.Config.GossipInterval == 0 { 254 return 255 } 256 t, err := g.GetState(HearthbeatKey) 257 if err != nil { 258 g.cluster.Logger().Error("Could not get heartbeat state", slog.Any("error", err)) 259 return 260 } 261 262 blockList := remote.GetRemote(g.cluster.ActorSystem).BlockList() 263 264 blocked := make([]string, 0) 265 266 for k, v := range t { 267 if k != g.cluster.ActorSystem.ID && 268 !blockList.IsBlocked(k) && 269 time.Now().Sub(time.UnixMilli(v.LocalTimestampUnixMilliseconds)) > g.cluster.Config.HeartbeatExpiration { 270 blocked = append(blocked, k) 271 } 272 } 273 274 if len(blocked) > 0 { 275 g.cluster.Logger().Info("Blocking members due to expired heartbeat", slog.String("members", strings.Join(blocked, ","))) 276 blockList.Block(blocked...) 277 } 278 } 279 280 // blockGracefullyLeft blocking members due to gracefully leaving 281 func (g *Gossiper) blockGracefullyLeft() { 282 t, err := g.GetState(GracefullyLeftKey) 283 if err != nil { 284 g.cluster.Logger().Error("Could not get gracefully left members", slog.Any("error", err)) 285 return 286 } 287 288 blockList := remote.GetRemote(g.cluster.ActorSystem).BlockList() 289 290 gracefullyLeft := make([]string, 0) 291 for k := range t { 292 if !blockList.IsBlocked(k) && k != g.cluster.ActorSystem.ID { 293 gracefullyLeft = append(gracefullyLeft, k) 294 } 295 } 296 if len(gracefullyLeft) > 0 { 297 g.cluster.Logger().Info("Blocking members due to gracefully leaving", slog.String("members", strings.Join(gracefullyLeft, ","))) 298 blockList.Block(gracefullyLeft...) 299 } 300 } 301 302 func (g *Gossiper) throttledLog(counter int32) { 303 g.cluster.Logger().Debug(fmt.Sprintf("[Gossiper] Gossiper Setting State to %s", g.pid), slog.Int("throttled", int(counter))) 304 }