github.com/asynkron/protoactor-go@v0.0.0-20240308120642-ef91a6abee75/cluster/clusterproviders/automanaged/automanaged.go

package automanaged

import (
	"encoding/json"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/asynkron/protoactor-go/cluster"
	"github.com/labstack/echo"
	"golang.org/x/sync/errgroup"
)

// TODO: needs to be attached to the provider instance
var (
	clusterTTLErrorMutex       = new(sync.Mutex)
	clusterMonitorErrorMutex   = new(sync.Mutex)
	shutdownMutex              = new(sync.Mutex)
	deregisteredMutex          = new(sync.Mutex)
	activeProviderMutex        = new(sync.Mutex)
	activeProviderRunningMutex = new(sync.Mutex)
)

type AutoManagedProvider struct {
	deregistered          bool
	shutdown              bool
	activeProvider        *echo.Echo
	activeProviderRunning bool
	activeProviderTesting bool
	httpClient            *http.Client
	monitoringStatus      bool
	clusterName           string
	address               string
	autoManagePort        int
	memberPort            int
	knownKinds            []string
	knownNodes            []*NodeModel
	hosts                 []string
	refreshTTL            time.Duration
	clusterTTLError       error
	clusterMonitorError   error
	cluster               *cluster.Cluster
}

// New creates an AutoManagedProvider that connects locally.
func New() *AutoManagedProvider {
	return NewWithConfig(
		2*time.Second,
		6330,
		"localhost:6330",
	)
}

// NewWithConfig creates an AutoManagedProvider that connects to all the given hosts.
func NewWithConfig(refreshTTL time.Duration, autoManPort int, hosts ...string) *AutoManagedProvider {
	transport := &http.Transport{
		Proxy: http.ProxyFromEnvironment,
		DialContext: (&net.Dialer{
			Timeout:   5 * time.Second,
			KeepAlive: 5 * time.Second,
		}).DialContext,
		MaxIdleConns:          10,
		IdleConnTimeout:       90 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
		MaxConnsPerHost:       10,
	}

	httpClient := &http.Client{
		Transport: transport,
		Timeout:   2 * time.Second,
	}

	p := &AutoManagedProvider{
		hosts:                 hosts,
		httpClient:            httpClient,
		refreshTTL:            refreshTTL,
		autoManagePort:        autoManPort,
		activeProviderRunning: false,
		monitoringStatus:      false,
	}

	return p
}

// NewWithTesting creates a testable provider that uses the supplied echo
// instance instead of creating its own.
func NewWithTesting(refreshTTL time.Duration, autoManPort int, activeProvider *echo.Echo, hosts ...string) *AutoManagedProvider {
	p := NewWithConfig(refreshTTL, autoManPort, hosts...)
	p.activeProviderTesting = true
	p.activeProvider = activeProvider
	return p
}
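// Wiring the provider into a cluster, as a minimal sketch. The lookup and
// remoteConfig values are assumptions and not defined here, and option names
// can vary between protoactor-go releases:
//
//	provider := automanaged.NewWithConfig(2*time.Second, 6330,
//		"node1:6330", "node2:6330")
//	config := cluster.Configure("mycluster", provider, lookup, remoteConfig)
//	c := cluster.New(actorSystem, config)
//	c.StartMember()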
func (p *AutoManagedProvider) init(cluster *cluster.Cluster) error {
	host, port, err := cluster.ActorSystem.GetHostPort()
	if err != nil {
		return err
	}

	p.clusterName = cluster.Config.Name
	p.address = host
	p.memberPort = port
	p.knownKinds = cluster.GetClusterKinds()
	p.deregistered = false
	p.shutdown = false
	p.cluster = cluster
	return nil
}

func (p *AutoManagedProvider) StartMember(cluster *cluster.Cluster) error {
	if err := p.init(cluster); err != nil {
		return err
	}
	p.UpdateTTL()
	p.monitorMemberStatusChanges()
	return nil
}

func (p *AutoManagedProvider) StartClient(cluster *cluster.Cluster) error {
	if err := p.init(cluster); err != nil {
		return err
	}
	// p.UpdateTTL()
	p.monitorMemberStatusChanges()
	return nil
}

// DeregisterMember sets deregistered to true, preventing any more TTL updates.
func (p *AutoManagedProvider) DeregisterMember() error {
	deregisteredMutex.Lock()
	defer deregisteredMutex.Unlock()

	p.deregistered = true
	return nil
}

// Shutdown sets shutdown to true, preventing any more TTL updates, and closes
// the health endpoint.
func (p *AutoManagedProvider) Shutdown(graceful bool) error {
	shutdownMutex.Lock()
	defer shutdownMutex.Unlock()

	p.shutdown = true
	if p.activeProvider != nil { // StartClient never starts the endpoint, so guard against nil
		p.activeProvider.Close()
	}
	return nil
}

// UpdateTTL starts the HTTP endpoint that other members probe for this
// member's status.
func (p *AutoManagedProvider) UpdateTTL() {
	activeProviderRunningMutex.Lock()
	running := p.activeProviderRunning
	activeProviderRunningMutex.Unlock()

	if (p.isShutdown() || p.isDeregistered()) && running {
		p.activeProvider.Close()
		return
	}

	if running {
		return
	}

	// it's not running, and it's not shutdown or de-registered
	// it's also not a test (this should be refactored)

	if !p.activeProviderTesting {
		p.activeProvider = echo.New()
		p.activeProvider.HideBanner = true
		p.activeProvider.GET("/_health", func(context echo.Context) error {
			return context.JSON(http.StatusOK, p.getCurrentNode())
		})
	}

	go func() {
		activeProviderRunningMutex.Lock()
		p.activeProviderRunning = true
		activeProviderRunningMutex.Unlock()

		appURI := fmt.Sprintf("0.0.0.0:%d", p.autoManagePort)
		p.cluster.Logger().Error("Automanaged server stopping..!", slog.Any("error", p.activeProvider.Start(appURI)))

		activeProviderRunningMutex.Lock()
		p.activeProviderRunning = false
		activeProviderRunningMutex.Unlock()
	}()
}

// monitorMemberStatusChanges starts a goroutine that continuously checks the
// other members until the provider is shut down or deregistered.
func (p *AutoManagedProvider) monitorMemberStatusChanges() {
	if !p.monitoringStatus {
		go func() {
			for !p.isShutdown() && !p.isDeregistered() {
				p.monitorStatuses()
			}
		}()
	}
	p.monitoringStatus = true
}

// GetHealthStatus returns an error if the cluster health status has problems.
func (p *AutoManagedProvider) GetHealthStatus() error {
	var err error
	clusterTTLErrorMutex.Lock()
	clusterMonitorErrorMutex.Lock()
	defer clusterMonitorErrorMutex.Unlock()
	defer clusterTTLErrorMutex.Unlock()

	if p.clusterTTLError != nil {
		err = fmt.Errorf("TTL: %s", p.clusterTTLError.Error())
	}

	if p.clusterMonitorError != nil {
		if err != nil {
			err = fmt.Errorf("%s - Monitor: %s", err.Error(), p.clusterMonitorError.Error())
		} else {
			err = fmt.Errorf("monitor: %s", p.clusterMonitorError.Error())
		}
	}

	return err
}
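// GetHealthStatus can back a liveness probe, e.g. from a plain net/http
// handler (a sketch; the route and server wiring are illustrative):
//
//	http.HandleFunc("/live", func(w http.ResponseWriter, r *http.Request) {
//		if err := provider.GetHealthStatus(); err != nil {
//			http.Error(w, err.Error(), http.StatusServiceUnavailable)
//			return
//		}
//		w.WriteHeader(http.StatusOK)
//	})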
//
// Private methods
//

// monitorStatuses checks for node changes in the cluster, then sleeps for
// refreshTTL before the caller loops again.
func (p *AutoManagedProvider) monitorStatuses() {
	clusterMonitorErrorMutex.Lock()
	defer clusterMonitorErrorMutex.Unlock()

	autoManagedNodes, err := p.checkNodes()
	if err != nil && len(autoManagedNodes) == 0 {
		p.cluster.Logger().Error("Failure reaching nodes", slog.Any("error", err))
		p.clusterMonitorError = err
		time.Sleep(p.refreshTTL)
		return
	}

	// we should probably check if the cluster needs to be updated.
	var members []*cluster.Member
	var newNodes []*NodeModel
	for _, node := range autoManagedNodes {
		if node == nil || node.ClusterName != p.clusterName {
			continue
		}
		ms := &cluster.Member{
			Id:    node.ID,
			Host:  node.Address,
			Port:  int32(node.Port),
			Kinds: node.Kinds,
		}
		members = append(members, ms)
		newNodes = append(newNodes, node)
	}

	p.knownNodes = newNodes
	p.clusterMonitorError = nil
	// publish the current cluster topology onto the event stream
	p.cluster.MemberList.UpdateClusterTopology(members)
	time.Sleep(p.refreshTTL)
}

// checkNodes pings all the nodes and returns the new cluster topology.
func (p *AutoManagedProvider) checkNodes() ([]*NodeModel, error) {
	allNodes := make([]*NodeModel, len(p.hosts))
	g, _ := errgroup.WithContext(context.Background())

	for indice, nodeHost := range p.hosts {
		idx, el := indice, nodeHost // https://golang.org/doc/faq#closures_and_goroutines

		// Each node check runs in its own goroutine; every goroutine writes
		// to a distinct index of the pre-sized slice, so no locking is needed.
		g.Go(func() error {
			url := fmt.Sprintf("http://%s/_health", el)
			req, err := http.NewRequest("GET", url, nil)
			if err != nil {
				p.cluster.Logger().Error("Couldn't request node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
				return err
			}

			resp, err := p.httpClient.Do(req)
			if err != nil {
				p.cluster.Logger().Error("Bad connection to the node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
				return err
			}

			defer resp.Body.Close() // nolint: errcheck

			if resp.StatusCode != http.StatusOK {
				err = fmt.Errorf("non 200 status returned: %d - from node: %s", resp.StatusCode, el)
				p.cluster.Logger().Error("Bad response from the node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
				return err
			}

			var node *NodeModel
			if err = json.NewDecoder(resp.Body).Decode(&node); err != nil {
				err = fmt.Errorf("could not deserialize response: %v - from node: %s", err, el)
				p.cluster.Logger().Error("Bad data from the node health status", slog.Any("error", err), slog.String("autoManMemberUrl", url))
				return err
			}

			allNodes[idx] = node
			return nil
		})
	}

	// wait until all node checks have returned
	err := g.Wait()
	var retNodes []*NodeModel

	// drop the hosts that failed (nil entries)
	for _, node := range allNodes {
		if node != nil {
			retNodes = append(retNodes, node)
		}
	}

	return retNodes, err
}
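// Probing a member by hand mirrors what each checkNodes goroutine does per
// host (a sketch; the host value is illustrative, and NodeModel is defined in
// node.go of this package):
//
//	resp, err := http.Get("http://localhost:6330/_health")
//	if err != nil {
//		// host unreachable; checkNodes records the error
//	}
//	defer resp.Body.Close()
//	var node *NodeModel
//	if err := json.NewDecoder(resp.Body).Decode(&node); err == nil {
//		// node.ClusterName must match the local cluster name to be counted
//	}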
func (p *AutoManagedProvider) deregisterService() {
	deregisteredMutex.Lock()
	defer deregisteredMutex.Unlock()

	p.deregistered = true
}

// startActiveProvider starts the /_health endpoint if it is not already
// running (this duplicates most of UpdateTTL and should be refactored).
func (p *AutoManagedProvider) startActiveProvider() {
	activeProviderRunningMutex.Lock()
	running := p.activeProviderRunning
	activeProviderRunningMutex.Unlock()

	if !running {
		if !p.activeProviderTesting {
			p.activeProvider = echo.New()
			p.activeProvider.HideBanner = true
			p.activeProvider.GET("/_health", func(context echo.Context) error {
				return context.JSON(http.StatusOK, p.getCurrentNode())
			})
		}

		appURI := fmt.Sprintf("0.0.0.0:%d", p.autoManagePort)

		go func() {
			activeProviderRunningMutex.Lock()
			p.activeProviderRunning = true
			activeProviderRunningMutex.Unlock()

			err := p.activeProvider.Start(appURI)
			p.cluster.Logger().Error("Automanaged server stopping..!", slog.Any("error", err))

			activeProviderRunningMutex.Lock()
			p.activeProviderRunning = false
			activeProviderRunningMutex.Unlock()
		}()
	}
}

// stopActiveProvider closes the /_health endpoint.
func (p *AutoManagedProvider) stopActiveProvider() {
	p.activeProvider.Close()
}

func (p *AutoManagedProvider) isShutdown() bool {
	shutdownMutex.Lock()
	defer shutdownMutex.Unlock()
	return p.shutdown
}

func (p *AutoManagedProvider) isDeregistered() bool {
	deregisteredMutex.Lock()
	defer deregisteredMutex.Unlock()
	return p.deregistered
}

func (p *AutoManagedProvider) isActiveProviderRunning() bool {
	activeProviderRunningMutex.Lock()
	defer activeProviderRunningMutex.Unlock()
	return p.activeProviderRunning
}

// getCurrentNode builds the NodeModel served by the /_health endpoint.
func (p *AutoManagedProvider) getCurrentNode() *NodeModel {
	return NewNode(p.clusterName, p.cluster.ActorSystem.ID, p.address, p.memberPort, p.autoManagePort, p.knownKinds)
}
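// For tests, NewWithTesting accepts a pre-configured echo instance so the
// health endpoint can be stubbed (a sketch; the port, kind and ID values are
// illustrative):
//
//	e := echo.New()
//	e.HideBanner = true
//	e.GET("/_health", func(c echo.Context) error {
//		return c.JSON(http.StatusOK, NewNode("mycluster", "node-1", "localhost", 8090, 6330, []string{"someKind"}))
//	})
//	p := NewWithTesting(2*time.Second, 6330, e, "localhost:6330")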