// Source: go.temporal.io/server@v1.23.0/common/membership/ringpop/service_resolver.go

// The MIT License
//
// Copyright (c) 2020 Temporal Technologies Inc. All rights reserved.
//
// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
24 25 package ringpop 26 27 import ( 28 "errors" 29 "net" 30 "strconv" 31 "sync" 32 "sync/atomic" 33 "time" 34 35 "github.com/temporalio/ringpop-go" 36 "github.com/temporalio/tchannel-go" 37 38 "github.com/dgryski/go-farm" 39 "github.com/temporalio/ringpop-go/events" 40 "github.com/temporalio/ringpop-go/hashring" 41 "github.com/temporalio/ringpop-go/swim" 42 43 "go.temporal.io/server/common" 44 "go.temporal.io/server/common/log" 45 "go.temporal.io/server/common/log/tag" 46 "go.temporal.io/server/common/membership" 47 "go.temporal.io/server/common/primitives" 48 "go.temporal.io/server/common/util" 49 ) 50 51 const ( 52 // roleKey label is set by every single service as soon as it bootstraps its 53 // ringpop instance. The data for this key is the service name 54 roleKey = "serviceName" 55 56 // rolePort label is set by every single service as soon as it bootstraps its 57 // ringpop instance. The data for this key represents the TCP port through which 58 // the service can be accessed. 59 rolePort = "servicePort" 60 61 minRefreshInternal = time.Second * 4 62 defaultRefreshInterval = time.Second * 10 63 replicaPoints = 100 64 ) 65 66 type serviceResolver struct { 67 service primitives.ServiceName 68 port int 69 rp *ringpop.Ringpop 70 refreshChan chan struct{} 71 shutdownCh chan struct{} 72 shutdownWG sync.WaitGroup 73 logger log.Logger 74 75 ringValue atomic.Value // this stores the current hashring 76 77 refreshLock sync.Mutex 78 lastRefreshTime time.Time 79 membersMap map[string]struct{} // for de-duping change notifications 80 81 listenerLock sync.RWMutex 82 listeners map[string]chan<- *membership.ChangedEvent 83 } 84 85 var _ membership.ServiceResolver = (*serviceResolver)(nil) 86 87 func newServiceResolver( 88 service primitives.ServiceName, 89 port int, 90 rp *ringpop.Ringpop, 91 logger log.Logger, 92 ) *serviceResolver { 93 resolver := &serviceResolver{ 94 service: service, 95 port: port, 96 rp: rp, 97 refreshChan: make(chan struct{}), 98 shutdownCh: make(chan 
struct{}), 99 logger: log.With(logger, tag.ComponentServiceResolver, tag.Service(service)), 100 membersMap: make(map[string]struct{}), 101 listeners: make(map[string]chan<- *membership.ChangedEvent), 102 } 103 resolver.ringValue.Store(newHashRing()) 104 return resolver 105 } 106 107 func newHashRing() *hashring.HashRing { 108 return hashring.New(farm.Fingerprint32, replicaPoints) 109 } 110 111 // Start starts the oracle 112 func (r *serviceResolver) Start() { 113 r.rp.AddListener(r) 114 if err := r.refresh(); err != nil { 115 r.logger.Fatal("unable to start ring pop service resolver", tag.Error(err)) 116 } 117 118 r.shutdownWG.Add(1) 119 go r.refreshRingWorker() 120 } 121 122 // Stop stops the resolver 123 func (r *serviceResolver) Stop() { 124 r.listenerLock.Lock() 125 defer r.listenerLock.Unlock() 126 r.rp.RemoveListener(r) 127 r.ringValue.Store(newHashRing()) 128 r.listeners = make(map[string]chan<- *membership.ChangedEvent) 129 close(r.shutdownCh) 130 131 if success := common.AwaitWaitGroup(&r.shutdownWG, time.Minute); !success { 132 r.logger.Warn("service resolver timed out on shutdown.") 133 } 134 } 135 136 func (r *serviceResolver) RequestRefresh() { 137 select { 138 case r.refreshChan <- struct{}{}: 139 default: 140 } 141 } 142 143 // Lookup finds the host in the ring responsible for serving the given key 144 func (r *serviceResolver) Lookup(key string) (membership.HostInfo, error) { 145 addr, found := r.ring().Lookup(key) 146 if !found { 147 r.RequestRefresh() 148 return nil, membership.ErrInsufficientHosts 149 } 150 151 return newHostInfo(addr, r.getLabelsMap()), nil 152 } 153 154 func (r *serviceResolver) LookupN(key string, n int) []membership.HostInfo { 155 if n <= 0 { 156 return nil 157 } 158 addresses := r.ring().LookupN(key, n) 159 if len(addresses) == 0 { 160 r.RequestRefresh() 161 return nil 162 } 163 labels := r.getLabelsMap() 164 return util.MapSlice(addresses, func(address string) membership.HostInfo { return newHostInfo(address, labels) }) 165 
} 166 167 func (r *serviceResolver) AddListener( 168 name string, 169 notifyChannel chan<- *membership.ChangedEvent, 170 ) error { 171 r.listenerLock.Lock() 172 defer r.listenerLock.Unlock() 173 _, ok := r.listeners[name] 174 if ok { 175 return membership.ErrListenerAlreadyExist 176 } 177 r.listeners[name] = notifyChannel 178 return nil 179 } 180 181 func (r *serviceResolver) RemoveListener( 182 name string, 183 ) error { 184 r.listenerLock.Lock() 185 defer r.listenerLock.Unlock() 186 _, ok := r.listeners[name] 187 if !ok { 188 return nil 189 } 190 delete(r.listeners, name) 191 return nil 192 } 193 194 func (r *serviceResolver) MemberCount() int { 195 return r.ring().ServerCount() 196 } 197 198 func (r *serviceResolver) Members() []membership.HostInfo { 199 var servers []membership.HostInfo 200 for _, s := range r.ring().Servers() { 201 servers = append(servers, newHostInfo(s, r.getLabelsMap())) 202 } 203 204 return servers 205 } 206 207 // HandleEvent handles updates from ringpop 208 func (r *serviceResolver) HandleEvent( 209 event events.Event, 210 ) { 211 // We only care about RingChangedEvent 212 if _, ok := event.(events.RingChangedEvent); ok { 213 r.logger.Debug("Received a ring changed event") 214 // Note that we receive events asynchronously, possibly out of order. 215 // We cannot rely on the content of the event, rather we load everything 216 // from ringpop when we get a notification that something changed. 
217 if err := r.refresh(); err != nil { 218 r.logger.Error("error refreshing ring when receiving a ring changed event", tag.Error(err)) 219 } 220 } 221 } 222 223 func (r *serviceResolver) refresh() error { 224 var event *membership.ChangedEvent 225 var err error 226 defer func() { 227 if event != nil { 228 r.emitEvent(event) 229 } 230 }() 231 r.refreshLock.Lock() 232 defer r.refreshLock.Unlock() 233 event, err = r.refreshNoLock() 234 return err 235 } 236 237 func (r *serviceResolver) refreshWithBackoff() error { 238 var event *membership.ChangedEvent 239 var err error 240 defer func() { 241 if event != nil { 242 r.emitEvent(event) 243 } 244 }() 245 r.refreshLock.Lock() 246 defer r.refreshLock.Unlock() 247 if r.lastRefreshTime.After(time.Now().UTC().Add(-minRefreshInternal)) { 248 // refresh too frequently 249 return nil 250 } 251 event, err = r.refreshNoLock() 252 return err 253 } 254 255 func (r *serviceResolver) refreshNoLock() (*membership.ChangedEvent, error) { 256 addrs, err := r.getReachableMembers() 257 if err != nil { 258 return nil, err 259 } 260 261 newMembersMap, changedEvent := r.compareMembers(addrs) 262 if changedEvent == nil { 263 return nil, nil 264 } 265 266 ring := newHashRing() 267 for _, addr := range addrs { 268 host := newHostInfo(addr, r.getLabelsMap()) 269 ring.AddMembers(host) 270 } 271 272 r.membersMap = newMembersMap 273 r.lastRefreshTime = time.Now().UTC() 274 r.ringValue.Store(ring) 275 r.logger.Info("Current reachable members", tag.Addresses(addrs)) 276 277 return changedEvent, nil 278 } 279 280 func (r *serviceResolver) getReachableMembers() ([]string, error) { 281 members, err := r.rp.GetReachableMemberObjects(swim.MemberWithLabelAndValue(roleKey, string(r.service))) 282 if err != nil { 283 return nil, err 284 } 285 286 var hostPorts []string 287 for _, member := range members { 288 servicePort := r.port 289 290 // Each temporal service in the ring should advertise which port it has its gRPC listener 291 // on via a service label. 
If we cannot find the label, we will assume that the 292 // temporal service is listening on the same port that this node is listening on. 293 servicePortLabel, ok := member.Label(rolePort) 294 if ok { 295 servicePort, err = strconv.Atoi(servicePortLabel) 296 if err != nil { 297 return nil, err 298 } 299 } else { 300 r.logger.Debug("unable to find roleport label for ringpop member. using local service's port", tag.Service(r.service)) 301 } 302 303 hostPort, err := replaceServicePort(member.Address, servicePort) 304 if err != nil { 305 return nil, err 306 } 307 308 hostPorts = append(hostPorts, hostPort) 309 } 310 311 return hostPorts, nil 312 } 313 314 func (r *serviceResolver) emitEvent(event *membership.ChangedEvent) { 315 // Notify listeners 316 r.listenerLock.RLock() 317 defer r.listenerLock.RUnlock() 318 319 for name, ch := range r.listeners { 320 select { 321 case ch <- event: 322 default: 323 r.logger.Error("Failed to send listener notification, channel full", tag.ListenerName(name)) 324 } 325 } 326 } 327 328 func (r *serviceResolver) refreshRingWorker() { 329 defer r.shutdownWG.Done() 330 331 refreshTicker := time.NewTicker(defaultRefreshInterval) 332 defer refreshTicker.Stop() 333 334 for { 335 select { 336 case <-r.shutdownCh: 337 return 338 case <-r.refreshChan: 339 if err := r.refreshWithBackoff(); err != nil { 340 r.logger.Error("error refreshing ring by request", tag.Error(err)) 341 } 342 case <-refreshTicker.C: 343 if err := r.refreshWithBackoff(); err != nil { 344 r.logger.Error("error periodically refreshing ring", tag.Error(err)) 345 } 346 } 347 } 348 } 349 350 func (r *serviceResolver) ring() *hashring.HashRing { 351 return r.ringValue.Load().(*hashring.HashRing) 352 } 353 354 func (r *serviceResolver) getLabelsMap() map[string]string { 355 labels := make(map[string]string) 356 labels[roleKey] = string(r.service) 357 return labels 358 } 359 360 func (r *serviceResolver) compareMembers(addrs []string) (map[string]struct{}, 
*membership.ChangedEvent) { 361 event := &membership.ChangedEvent{} 362 changed := false 363 newMembersMap := make(map[string]struct{}, len(addrs)) 364 for _, addr := range addrs { 365 newMembersMap[addr] = struct{}{} 366 if _, ok := r.membersMap[addr]; !ok { 367 event.HostsAdded = append(event.HostsAdded, newHostInfo(addr, r.getLabelsMap())) 368 changed = true 369 } 370 } 371 for addr := range r.membersMap { 372 if _, ok := newMembersMap[addr]; !ok { 373 event.HostsRemoved = append(event.HostsRemoved, newHostInfo(addr, r.getLabelsMap())) 374 changed = true 375 } 376 } 377 if changed { 378 return newMembersMap, event 379 } 380 return newMembersMap, nil 381 } 382 383 // buildBroadcastHostPort return the listener hostport from an existing tchannel 384 // and overrides the address with broadcastAddress if specified 385 func buildBroadcastHostPort(listenerPeerInfo tchannel.LocalPeerInfo, broadcastAddress string) (string, error) { 386 // Ephemeral port check copied from ringpop-go/ringpop.go/channelAddressResolver 387 // Check that TChannel is listening on a real hostport. By default, 388 // TChannel listens on an ephemeral host/port. The real port is then 389 // assigned by the OS when ListenAndServe is called. If the hostport is 390 // ephemeral, it means TChannel is not yet listening and the hostport 391 // cannot be resolved. 
392 if listenerPeerInfo.IsEphemeralHostPort() { 393 return "", ringpop.ErrEphemeralAddress 394 } 395 396 // Parse listener hostport 397 listenerIPString, port, err := net.SplitHostPort(listenerPeerInfo.HostPort) 398 if err != nil { 399 return "", err 400 } 401 402 // Broadcast IP override 403 if broadcastAddress != "" { 404 // Parse supplied broadcastAddress override 405 ip := net.ParseIP(broadcastAddress) 406 if ip == nil { 407 return "", errors.New("broadcastAddress set but unknown failure encountered while parsing") 408 } 409 410 // If no errors, use the parsed IP with the port from our listener 411 return net.JoinHostPort(ip.String(), port), nil 412 } 413 414 listenerIP := net.ParseIP(listenerIPString) 415 if listenerIP == nil { 416 return "", errors.New("unable to parse listenerIP") 417 } 418 419 if listenerIP.IsUnspecified() { 420 return "", errors.New("broadcastAddress required when listening on all interfaces (0.0.0.0/[::])") 421 } 422 423 return listenerPeerInfo.HostPort, nil 424 }