github.com/asynkron/protoactor-go@v0.0.0-20240308120642-ef91a6abee75/cluster/gossiper.go (about)

     1  // Copyright (C) 2015-2022 Asynkron AB All rights reserved
     2  
     3  package cluster
     4  
     5  import (
     6  	"errors"
     7  	"fmt"
     8  	"log/slog"
     9  	"strings"
    10  	"time"
    11  
    12  	"github.com/asynkron/protoactor-go/remote"
    13  
    14  	"github.com/asynkron/gofun/set"
    15  	"google.golang.org/protobuf/proto"
    16  
    17  	"github.com/asynkron/protoactor-go/actor"
    18  	"google.golang.org/protobuf/types/known/anypb"
    19  )
    20  
// DefaultGossipActorName is the actor name a new Gossiper starts with; it is
// the name the gossip actor is spawned under unless an option overrides
// Gossiper.GossipActorName.
const DefaultGossipActorName string = "gossip"
    22  
// GossipUpdate is used to update gossip data when a ClusterTopology event
// occurs.
//
// It carries a member ID, a state key, the value wrapped in a protobuf Any and
// a sequence number.
// NOTE(review): presumably MemberID is the member that owns the entry and
// SeqNumber orders updates of the same key; confirm against the gossip state
// implementation.
type GossipUpdate struct {
	MemberID, Key string
	Value         *anypb.Any
	SeqNumber     int64
}
    29  
// ConsensusChecker is the callback type used to provide consensus checks for
// values of any type. It receives the gossip state and a set of strings
// (a map with empty values) and returns a flag together with an untyped value.
// NOTE(review): presumably the set holds member IDs and the flag reports
// whether consensus was reached; confirm against the consensus check builder.
//
// With generics (Go 1.18+) this would be equivalent to:
//
//	type ConsensusChecker[T] func(GossipState, map[string]empty) (bool, T)
type ConsensusChecker func(*GossipState, map[string]empty) (bool, interface{})
    35  
// Gossiper manages gossip for a cluster member: it spawns the gossip actor,
// forwards state reads and writes to it, and runs the periodic gossip loop.
type Gossiper struct {
	// GossipActorName is the name the gossip actor is spawned under,
	// defaults to "gossip" (DefaultGossipActorName).
	GossipActorName string

	// The cluster this Gossiper belongs to
	cluster *Cluster

	// PID of the gossip actor; nil until StartGossiping has spawned it
	pid *actor.PID

	// Channel closed by Shutdown to stop the gossip loop
	close chan struct{}

	// Throttle for the debug log messages; nil until StartGossiping sets it
	throttler actor.ShouldThrottle
}
    53  
    54  // Creates a new Gossiper value and return it back
    55  func newGossiper(cl *Cluster, opts ...Option) (*Gossiper, error) {
    56  	// create a new Gossiper value
    57  	gossiper := &Gossiper{
    58  		GossipActorName: DefaultGossipActorName,
    59  		cluster:         cl,
    60  		close:           make(chan struct{}),
    61  	}
    62  
    63  	// apply any given options
    64  	for _, opt := range opts {
    65  		opt(gossiper)
    66  	}
    67  
    68  	return gossiper, nil
    69  }
    70  
    71  func (g *Gossiper) GetState(key string) (map[string]*GossipKeyValue, error) {
    72  	if g.throttler() == actor.Open {
    73  		g.cluster.Logger().Debug(fmt.Sprintf("Gossiper getting state from %s", g.pid))
    74  	}
    75  
    76  	msg := NewGetGossipStateRequest(key)
    77  	timeout := g.cluster.Config.TimeoutTime
    78  	r, err := g.cluster.ActorSystem.Root.RequestFuture(g.pid, &msg, timeout).Result()
    79  	if err != nil {
    80  		switch err {
    81  		case actor.ErrTimeout:
    82  			g.cluster.Logger().Error("Could not get a response from GossipActor: request timeout", slog.Any("error", err), slog.String("remote", g.pid.String()))
    83  			return nil, err
    84  		case actor.ErrDeadLetter:
    85  			g.cluster.Logger().Error("remote no longer exists", slog.Any("error", err), slog.String("remote", g.pid.String()))
    86  			return nil, err
    87  		default:
    88  			g.cluster.Logger().Error("Could not get a response from GossipActor", slog.Any("error", err), slog.String("remote", g.pid.String()))
    89  			return nil, err
    90  		}
    91  	}
    92  
    93  	// try to cast the response to GetGossipStateResponse concrete value
    94  	response, ok := r.(*GetGossipStateResponse)
    95  	if !ok {
    96  		err := fmt.Errorf("could not promote %T interface to GetGossipStateResponse", r)
    97  		g.cluster.Logger().Error("Could not get a response from GossipActor", slog.Any("error", err), slog.String("remote", g.pid.String()))
    98  		return nil, err
    99  	}
   100  
   101  	return response.State, nil
   102  }
   103  
   104  // SetState Sends fire and forget message to update member state
   105  func (g *Gossiper) SetState(key string, value proto.Message) {
   106  	if g.throttler() == actor.Open {
   107  		g.cluster.Logger().Debug(fmt.Sprintf("Gossiper setting state %s to %s", key, g.pid))
   108  	}
   109  
   110  	if g.pid == nil {
   111  		return
   112  	}
   113  
   114  	msg := NewGossipStateKey(key, value)
   115  	g.cluster.ActorSystem.Root.Send(g.pid, &msg)
   116  }
   117  
   118  // SetStateRequest Sends a Request (that blocks) to update member state
   119  func (g *Gossiper) SetStateRequest(key string, value proto.Message) error {
   120  	if g.throttler() == actor.Open {
   121  		g.cluster.Logger().Debug(fmt.Sprintf("Gossiper setting state %s to %s", key, g.pid))
   122  	}
   123  
   124  	if g.pid == nil {
   125  		return errors.New("gossiper Actor PID is nil")
   126  	}
   127  
   128  	msg := NewGossipStateKey(key, value)
   129  	r, err := g.cluster.ActorSystem.Root.RequestFuture(g.pid, &msg, g.cluster.Config.TimeoutTime).Result()
   130  	if err != nil {
   131  		if err == actor.ErrTimeout {
   132  			g.cluster.Logger().Error("Could not get a response from Gossiper Actor: request timeout", slog.String("remote", g.pid.String()))
   133  			return err
   134  		}
   135  		g.cluster.Logger().Error("Could not get a response from Gossiper Actor", slog.Any("error", err), slog.String("remote", g.pid.String()))
   136  		return err
   137  	}
   138  
   139  	// try to cast the response to SetGossipStateResponse concrete value
   140  	_, ok := r.(*SetGossipStateResponse)
   141  	if !ok {
   142  		err := fmt.Errorf("could not promote %T interface to SetGossipStateResponse", r)
   143  		g.cluster.Logger().Error("Could not get a response from Gossip Actor", slog.Any("error", err), slog.String("remote", g.pid.String()))
   144  		return err
   145  	}
   146  	return nil
   147  }
   148  
   149  func (g *Gossiper) SendState() {
   150  	if g.pid == nil {
   151  		return
   152  	}
   153  
   154  	r, err := g.cluster.ActorSystem.Root.RequestFuture(g.pid, &SendGossipStateRequest{}, 5*time.Second).Result()
   155  	if err != nil {
   156  		g.cluster.Logger().Warn("Gossip could not send gossip request", slog.Any("PID", g.pid), slog.Any("error", err))
   157  		return
   158  	}
   159  
   160  	if _, ok := r.(*SendGossipStateResponse); !ok {
   161  		g.cluster.Logger().Error("Gossip SendState received unknown response", slog.Any("message", r))
   162  	}
   163  }
   164  
   165  // RegisterConsensusCheck Builds a consensus handler and a consensus checker, send the checker to the
   166  // Gossip actor and returns the handler back to the caller
   167  func (g *Gossiper) RegisterConsensusCheck(key string, getValue func(*anypb.Any) interface{}) ConsensusHandler {
   168  	definition := NewConsensusCheckBuilder(g.cluster.Logger(), key, getValue)
   169  	consensusHandle, check := definition.Build()
   170  	request := NewAddConsensusCheck(consensusHandle.GetID(), check)
   171  	g.cluster.ActorSystem.Root.Send(g.pid, &request)
   172  	return consensusHandle
   173  }
   174  
   175  func (g *Gossiper) StartGossiping() error {
   176  	var err error
   177  	g.pid, err = g.cluster.ActorSystem.Root.SpawnNamed(actor.PropsFromProducerWithActorSystem(func(system *actor.ActorSystem) actor.Actor {
   178  		return NewGossipActor(
   179  			g.cluster.Config.GossipRequestTimeout,
   180  			g.cluster.ActorSystem.ID,
   181  			func() set.Set[string] {
   182  				return g.cluster.GetBlockedMembers()
   183  			},
   184  			g.cluster.Config.GossipFanOut,
   185  			g.cluster.Config.GossipMaxSend,
   186  			system,
   187  		)
   188  	}), g.GossipActorName)
   189  
   190  	if err != nil {
   191  		g.cluster.Logger().Error("Failed to start gossip actor", slog.Any("error", err))
   192  		return err
   193  	}
   194  
   195  	g.cluster.ActorSystem.EventStream.Subscribe(func(evt interface{}) {
   196  		if topology, ok := evt.(*ClusterTopology); ok {
   197  			g.cluster.ActorSystem.Root.Send(g.pid, topology)
   198  		}
   199  	})
   200  	g.cluster.Logger().Info("Started Cluster Gossip")
   201  	g.throttler = actor.NewThrottle(3, 60*time.Second, g.throttledLog)
   202  	go g.gossipLoop()
   203  
   204  	return nil
   205  }
   206  
   207  func (g *Gossiper) Shutdown() {
   208  	if g.pid == nil {
   209  		return
   210  	}
   211  
   212  	g.cluster.Logger().Info("Shutting down gossip")
   213  
   214  	close(g.close)
   215  
   216  	err := g.cluster.ActorSystem.Root.StopFuture(g.pid).Wait()
   217  	if err != nil {
   218  		g.cluster.Logger().Error("failed to stop gossip actor", slog.Any("error", err))
   219  	}
   220  
   221  	g.cluster.Logger().Info("Shut down gossip")
   222  }
   223  
   224  func (g *Gossiper) gossipLoop() {
   225  	g.cluster.Logger().Info("Starting gossip loop")
   226  
   227  	// create a ticker that will tick each GossipInterval milliseconds
   228  	// we do not use sleep as sleep puts the goroutine out of the scheduler
   229  	// P, and we do not want our Gs to be scheduled out from the running Ms
   230  	ticker := time.NewTicker(g.cluster.Config.GossipInterval)
   231  breakLoop:
   232  	for !g.cluster.ActorSystem.IsStopped() {
   233  		select {
   234  		case <-g.close:
   235  			g.cluster.Logger().Info("Stopping Gossip Loop")
   236  			break breakLoop
   237  		case <-ticker.C:
   238  
   239  			g.blockExpiredHeartbeats()
   240  			g.blockGracefullyLeft()
   241  
   242  			g.SetState(HearthbeatKey, &MemberHeartbeat{
   243  				// todo collect the actor statistics
   244  				ActorStatistics: &ActorStatistics{},
   245  			})
   246  			g.SendState()
   247  		}
   248  	}
   249  }
   250  
   251  // blockExpiredHeartbeats blocks members that have not sent a heartbeat for a long time
   252  func (g *Gossiper) blockExpiredHeartbeats() {
   253  	if g.cluster.Config.GossipInterval == 0 {
   254  		return
   255  	}
   256  	t, err := g.GetState(HearthbeatKey)
   257  	if err != nil {
   258  		g.cluster.Logger().Error("Could not get heartbeat state", slog.Any("error", err))
   259  		return
   260  	}
   261  
   262  	blockList := remote.GetRemote(g.cluster.ActorSystem).BlockList()
   263  
   264  	blocked := make([]string, 0)
   265  
   266  	for k, v := range t {
   267  		if k != g.cluster.ActorSystem.ID &&
   268  			!blockList.IsBlocked(k) &&
   269  			time.Now().Sub(time.UnixMilli(v.LocalTimestampUnixMilliseconds)) > g.cluster.Config.HeartbeatExpiration {
   270  			blocked = append(blocked, k)
   271  		}
   272  	}
   273  
   274  	if len(blocked) > 0 {
   275  		g.cluster.Logger().Info("Blocking members due to expired heartbeat", slog.String("members", strings.Join(blocked, ",")))
   276  		blockList.Block(blocked...)
   277  	}
   278  }
   279  
   280  // blockGracefullyLeft blocking members due to gracefully leaving
   281  func (g *Gossiper) blockGracefullyLeft() {
   282  	t, err := g.GetState(GracefullyLeftKey)
   283  	if err != nil {
   284  		g.cluster.Logger().Error("Could not get gracefully left members", slog.Any("error", err))
   285  		return
   286  	}
   287  
   288  	blockList := remote.GetRemote(g.cluster.ActorSystem).BlockList()
   289  
   290  	gracefullyLeft := make([]string, 0)
   291  	for k := range t {
   292  		if !blockList.IsBlocked(k) && k != g.cluster.ActorSystem.ID {
   293  			gracefullyLeft = append(gracefullyLeft, k)
   294  		}
   295  	}
   296  	if len(gracefullyLeft) > 0 {
   297  		g.cluster.Logger().Info("Blocking members due to gracefully leaving", slog.String("members", strings.Join(gracefullyLeft, ",")))
   298  		blockList.Block(gracefullyLeft...)
   299  	}
   300  }
   301  
   302  func (g *Gossiper) throttledLog(counter int32) {
   303  	g.cluster.Logger().Debug(fmt.Sprintf("[Gossiper] Gossiper Setting State to %s", g.pid), slog.Int("throttled", int(counter)))
   304  }