github.com/asynkron/protoactor-go@v0.0.0-20240308120642-ef91a6abee75/cluster/clusterproviders/automanaged/automanaged.go

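        // Package automanaged implements a cluster provider that needs no external
        // infrastructure: every member serves a small HTTP health endpoint and
        // periodically polls the other configured hosts to discover the topology.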
     1  package automanaged
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"log/slog"
     7  	"net"
     8  	"net/http"
     9  	"sync"
    10  	"time"
    11  
    12  	"golang.org/x/net/context"
    13  
    14  	"github.com/asynkron/protoactor-go/cluster"
    15  	"github.com/labstack/echo"
    16  	"golang.org/x/sync/errgroup"
    17  )
    18  
    19  // TODO: needs to be attached to the provider instance
    20  var (
    21  	clusterTTLErrorMutex       = new(sync.Mutex)
    22  	clusterMonitorErrorMutex   = new(sync.Mutex)
    23  	shutdownMutex              = new(sync.Mutex)
    24  	deregisteredMutex          = new(sync.Mutex)
    25  	activeProviderMutex        = new(sync.Mutex)
    26  	activeProviderRunningMutex = new(sync.Mutex)
    27  )
    28  
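        // AutoManagedProvider manages cluster membership by exposing its own
        // /_health endpoint and periodically polling the endpoints of the other
        // configured hosts.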
    29  type AutoManagedProvider struct {
    30  	deregistered          bool
    31  	shutdown              bool
    32  	activeProvider        *echo.Echo
    33  	activeProviderRunning bool
    34  	activeProviderTesting bool
    35  	httpClient            *http.Client
    36  	monitoringStatus      bool
    37  	clusterName           string
    38  	address               string
    39  	autoManagePort        int
    40  	memberPort            int
    41  	knownKinds            []string
    42  	knownNodes            []*NodeModel
    43  	hosts                 []string
    44  	refreshTTL            time.Duration
    45  	clusterTTLError       error
    46  	clusterMonitorError   error
    47  	cluster               *cluster.Cluster
    48  }
    49  
    50  // New creates an AutoManagedProvider that connects to localhost on the default port (6330)
    51  func New() *AutoManagedProvider {
    52  	return NewWithConfig(
    53  		2*time.Second,
    54  		6330,
    55  		"localhost:6330",
    56  	)
    57  }
    58  
    59  // NewWithConfig creates an AutoManagedProvider that connects to all the given hosts
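        //
        // A minimal usage sketch (host addresses are illustrative; the rest of the
        // cluster configuration is elided):
        //
        //	provider := automanaged.NewWithConfig(2*time.Second, 6330,
        //		"node1:6330", "node2:6330", "node3:6330")
        //	// pass provider as the ClusterProvider when configuring the cluster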
    60  func NewWithConfig(refreshTTL time.Duration, autoManPort int, hosts ...string) *AutoManagedProvider {
    61  	transport := &http.Transport{
    62  		Proxy: http.ProxyFromEnvironment,
    63  		DialContext: (&net.Dialer{
    64  			Timeout:   5 * time.Second,
    65  			KeepAlive: 5 * time.Second,
    66  		}).DialContext,
    67  		MaxIdleConns:          10,
    68  		IdleConnTimeout:       90 * time.Second,
    69  		ExpectContinueTimeout: 1 * time.Second,
    70  		MaxConnsPerHost:       10,
    71  	}
    72  
    73  	httpClient := &http.Client{
    74  		Transport: transport,
    75  		Timeout:   2 * time.Second,
    76  	}
    77  
    78  	p := &AutoManagedProvider{
    79  		hosts:                 hosts,
    80  		httpClient:            httpClient,
    81  		refreshTTL:            refreshTTL,
    82  		autoManagePort:        autoManPort,
    83  		activeProviderRunning: false,
    84  		monitoringStatus:      false,
    85  	}
    86  
    87  	return p
    88  }
    89  
    90  // NewWithTesting creates a provider that reuses the supplied echo instance instead of creating its own, for use in tests
    91  func NewWithTesting(refreshTTL time.Duration, autoManPort int, activeProvider *echo.Echo, hosts ...string) *AutoManagedProvider {
    92  	p := NewWithConfig(refreshTTL, autoManPort, hosts...)
    93  	p.activeProviderTesting = true
    94  	p.activeProvider = activeProvider
    95  	return p
    96  }
    97  
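        // init captures the cluster configuration (name, host, port and known kinds)
        // on the provider before it starts serving or monitoring.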
    98  func (p *AutoManagedProvider) init(cluster *cluster.Cluster) error {
    99  	host, port, err := cluster.ActorSystem.GetHostPort()
   100  	if err != nil {
   101  		return err
   102  	}
   103  
   104  	p.clusterName = cluster.Config.Name
   105  	p.address = host
   106  	p.memberPort = port
   107  	p.knownKinds = cluster.GetClusterKinds()
   108  	p.deregistered = false
   109  	p.shutdown = false
   110  	p.cluster = cluster
   111  	return nil
   112  }
   113  
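        // StartMember registers this node as a full member: it starts the /_health
        // endpoint via UpdateTTL and begins monitoring the other hosts.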
   114  func (p *AutoManagedProvider) StartMember(cluster *cluster.Cluster) error {
   115  	if err := p.init(cluster); err != nil {
   116  		return err
   117  	}
   118  	p.UpdateTTL()
   119  	p.monitorMemberStatusChanges()
   120  	return nil
   121  }
   122  
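        // StartClient joins the cluster as a client only: it monitors the other
        // hosts but does not expose its own /_health endpoint.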
   123  func (p *AutoManagedProvider) StartClient(cluster *cluster.Cluster) error {
   124  	if err := p.init(cluster); err != nil {
   125  		return err
   126  	}
   127  	// p.UpdateTTL()
   128  	p.monitorMemberStatusChanges()
   129  	return nil
   130  }
   131  
   132  // DeregisterMember sets deregistered to true, preventing any further TTL updates
   133  func (p *AutoManagedProvider) DeregisterMember() error {
   134  	deregisteredMutex.Lock()
   135  	defer deregisteredMutex.Unlock()
   136  
   137  	p.deregistered = true
   138  	return nil
   139  }
   140  
   141  // Shutdown sets shutdown to true, preventing any further TTL updates, and closes the active provider endpoint
   142  func (p *AutoManagedProvider) Shutdown(graceful bool) error {
   143  	shutdownMutex.Lock()
   144  	defer shutdownMutex.Unlock()
   145  
   146  	p.shutdown = true
   147  	p.activeProvider.Close()
   148  	return nil
   149  }
   150  
   151  // UpdateTTL starts the /_health endpoint that other members poll, unless the provider has been shut down or deregistered
   152  func (p *AutoManagedProvider) UpdateTTL() {
   153  	activeProviderRunningMutex.Lock()
   154  	running := p.activeProviderRunning
   155  	activeProviderRunningMutex.Unlock()
   156  
   157  	if (p.isShutdown() || p.isDeregistered()) && running {
   158  		p.activeProvider.Close()
   159  		return
   160  	}
   161  
   162  	if running {
   163  		return
   164  	}
   165  
   166  	// the endpoint is not running, and the provider is neither shut down nor deregistered.
   167  	// in testing mode the injected echo instance is reused instead of creating a new one (this should be refactored)
   168  
   169  	if !p.activeProviderTesting {
   170  		p.activeProvider = echo.New()
   171  		p.activeProvider.HideBanner = true
   172  		p.activeProvider.GET("/_health", func(context echo.Context) error {
   173  			return context.JSON(http.StatusOK, p.getCurrentNode())
   174  		})
   175  	}
   176  	go func() {
   177  		activeProviderRunningMutex.Lock()
   178  		p.activeProviderRunning = true
   179  		activeProviderRunningMutex.Unlock()
   180  
   181  		appURI := fmt.Sprintf("0.0.0.0:%d", p.autoManagePort)
   182  		p.cluster.Logger().Error("Automanaged server stopping..!", slog.Any("error", p.activeProvider.Start(appURI)))
   183  
   184  		activeProviderRunningMutex.Lock()
   185  		p.activeProviderRunning = false
   186  		activeProviderRunningMutex.Unlock()
   187  	}()
   188  }
   189  
   190  // monitorMemberStatusChanges starts a goroutine that continuously checks the health of the other members
   191  func (p *AutoManagedProvider) monitorMemberStatusChanges() {
   192  	if !p.monitoringStatus {
   193  		go func() {
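        			// monitorStatuses sleeps for refreshTTL on every iteration,
        			// so this loop polls at that interval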
   194  			for !p.isShutdown() && !p.isDeregistered() {
   195  				p.monitorStatuses()
   196  			}
   197  		}()
   198  	}
   199  	p.monitoringStatus = true
   200  }
   201  
   202  // GetHealthStatus returns an error if the TTL endpoint or the member monitoring has reported a problem
   203  func (p *AutoManagedProvider) GetHealthStatus() error {
   204  	var err error
   205  	clusterTTLErrorMutex.Lock()
   206  	clusterMonitorErrorMutex.Lock()
   207  	defer clusterMonitorErrorMutex.Unlock()
   208  	defer clusterTTLErrorMutex.Unlock()
   209  
   210  	if p.clusterTTLError != nil {
   211  		err = fmt.Errorf("TTL: %s", p.clusterTTLError.Error())
   212  	}
   213  
   214  	if p.clusterMonitorError != nil {
   215  		if err != nil {
   216  			err = fmt.Errorf("%s - Monitor: %s", err.Error(), p.clusterMonitorError.Error())
   217  		} else {
   218  			err = fmt.Errorf("monitor: %s", p.clusterMonitorError.Error())
   219  		}
   220  	}
   221  
   222  	return err
   223  }
   224  
   225  //
   226  // Private methods
   227  //
   228  
   229  // monitorStatuses checks for node changes in the cluster
   230  func (p *AutoManagedProvider) monitorStatuses() {
   231  	clusterMonitorErrorMutex.Lock()
   232  	defer clusterMonitorErrorMutex.Unlock()
   233  
   234  	autoManagedNodes, err := p.checkNodes()
   235  	if err != nil && len(autoManagedNodes) == 0 {
   236  		p.cluster.Logger().Error("Failure reaching nodes", slog.Any("error", err))
   237  		p.clusterMonitorError = err
   238  		time.Sleep(p.refreshTTL)
   239  		return
   240  	}
   241  	// we should probably check if the cluster needs to be updated.
   242  	var members []*cluster.Member
   243  	var newNodes []*NodeModel
   244  	for _, node := range autoManagedNodes {
   245  		if node == nil || node.ClusterName != p.clusterName {
   246  			continue
   247  		}
   248  		ms := &cluster.Member{
   249  			Id:    node.ID,
   250  			Host:  node.Address,
   251  			Port:  int32(node.Port),
   252  			Kinds: node.Kinds,
   253  		}
   254  		members = append(members, ms)
   255  		newNodes = append(newNodes, node)
   256  	}
   257  
   258  	p.knownNodes = newNodes
   259  	p.clusterMonitorError = nil
   260  	// publish the current cluster topology onto the event stream
   261  	p.cluster.MemberList.UpdateClusterTopology(members)
   262  	time.Sleep(p.refreshTTL)
   263  }
   264  
   265  // checkNodes queries every configured host's /_health endpoint and returns the nodes that responded
   266  func (p *AutoManagedProvider) checkNodes() ([]*NodeModel, error) {
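        	// pre-sized so each goroutine writes only to its own index,
        	// avoiding the need for extra locking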
   267  	allNodes := make([]*NodeModel, len(p.hosts))
   268  	g, _ := errgroup.WithContext(context.Background())
   269  
   270  	for indice, nodeHost := range p.hosts {
   271  		idx, el := indice, nodeHost // https://golang.org/doc/faq#closures_and_goroutines
   272  
   273  		// run each node check concurrently in the errgroup
   274  		g.Go(func() error {
   275  			url := fmt.Sprintf("http://%s/_health", el)
   276  			req, err := http.NewRequest("GET", url, nil)
   277  			if err != nil {
   278  				p.cluster.Logger().Error("Couldn't request node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
   279  				return err
   280  			}
   281  
   282  			resp, err := p.httpClient.Do(req)
   283  			if err != nil {
   284  				p.cluster.Logger().Error("Bad connection to the node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
   285  				return err
   286  			}
   287  
   288  			defer resp.Body.Close() // nolint: errcheck
   289  
   290  			if resp.StatusCode != http.StatusOK {
   291  				err = fmt.Errorf("non 200 status returned: %d - from node: %s", resp.StatusCode, el)
   292  				p.cluster.Logger().Error("Bad response from the node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
   293  				return err
   294  			}
   295  
   296  			var node *NodeModel
   297  			err = json.NewDecoder(resp.Body).Decode(&node)
   298  			if err != nil {
   299  				err = fmt.Errorf("could not deserialize response: %v - from node: %s", resp, el)
   300  				p.cluster.Logger().Error("Bad data from the node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
   301  				return err
   302  			}
   303  
   304  			allNodes[idx] = node
   305  			return nil
   306  		})
   307  	}
   308  
   309  	// waits until all functions have returned
   310  	err := g.Wait()
   311  	var retNodes []*NodeModel
   312  
   313  	// clear out the nil ones
   314  	for _, node := range allNodes {
   315  		if node != nil {
   316  			retNodes = append(retNodes, node)
   317  		}
   318  	}
   319  
   320  	return retNodes, err
   321  }
   322  
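        // deregisterService marks the provider as deregistered so the monitoring
        // loop and TTL updates stop.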
   323  func (p *AutoManagedProvider) deregisterService() {
   324  	deregisteredMutex.Lock()
   325  	defer deregisteredMutex.Unlock()
   326  
   327  	p.deregistered = true
   328  }
   329  
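        // startActiveProvider starts the /_health endpoint if it is not already
        // running; it mirrors the endpoint setup in UpdateTTL.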
   330  func (p *AutoManagedProvider) startActiveProvider() {
   331  	activeProviderRunningMutex.Lock()
   332  	running := p.activeProviderRunning
   333  	activeProviderRunningMutex.Unlock()
   334  
   335  	if !running {
   336  		if !p.activeProviderTesting {
   337  			p.activeProvider = echo.New()
   338  			p.activeProvider.HideBanner = true
   339  			p.activeProvider.GET("/_health", func(context echo.Context) error {
   340  				return context.JSON(http.StatusOK, p.getCurrentNode())
   341  			})
   342  		}
   343  
   344  		appURI := fmt.Sprintf("0.0.0.0:%d", p.autoManagePort)
   345  
   346  		go func() {
   347  			activeProviderRunningMutex.Lock()
   348  			p.activeProviderRunning = true
   349  			activeProviderRunningMutex.Unlock()
   350  			err := p.activeProvider.Start(appURI)
   351  			p.cluster.Logger().Error("Automanaged server stopping..!", slog.Any("error", err))
   352  
   353  			activeProviderRunningMutex.Lock()
   354  			p.activeProviderRunning = false
   355  			activeProviderRunningMutex.Unlock()
   356  		}()
   357  	}
   358  }
   359  
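        // stopActiveProvider shuts down the /_health endpoint.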
   360  func (p *AutoManagedProvider) stopActiveProvider() {
   361  	p.activeProvider.Close()
   362  }
   363  
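        // isShutdown reports whether Shutdown has been called.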
   364  func (p *AutoManagedProvider) isShutdown() bool {
   365  	shutdownMutex.Lock()
   366  	defer shutdownMutex.Unlock()
   367  	return p.shutdown
   368  }
   369  
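        // isDeregistered reports whether the member has been deregistered.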
   370  func (p *AutoManagedProvider) isDeregistered() bool {
   371  	deregisteredMutex.Lock()
   372  	defer deregisteredMutex.Unlock()
   373  	return p.deregistered
   374  }
   375  
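        // isActiveProviderRunning reports whether the /_health endpoint is currently serving.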
   376  func (p *AutoManagedProvider) isActiveProviderRunning() bool {
   377  	activeProviderRunningMutex.Lock()
   378  	defer activeProviderRunningMutex.Unlock()
   379  	return p.activeProviderRunning
   380  }
   381  
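        // getCurrentNode builds the NodeModel that this member reports from its
        // /_health endpoint.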
   382  func (p *AutoManagedProvider) getCurrentNode() *NodeModel {
   383  	return NewNode(p.clusterName, p.cluster.ActorSystem.ID, p.address, p.memberPort, p.autoManagePort, p.knownKinds)
   384  }