go.temporal.io/server@v1.23.0/common/membership/ringpop/service_resolver.go (about)

     1  // The MIT License
     2  //
     3  // Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
     4  //
     5  // Copyright (c) 2020 Uber Technologies, Inc.
     6  //
     7  // Permission is hereby granted, free of charge, to any person obtaining a copy
     8  // of this software and associated documentation files (the "Software"), to deal
     9  // in the Software without restriction, including without limitation the rights
    10  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    11  // copies of the Software, and to permit persons to whom the Software is
    12  // furnished to do so, subject to the following conditions:
    13  //
    14  // The above copyright notice and this permission notice shall be included in
    15  // all copies or substantial portions of the Software.
    16  //
    17  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    18  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    22  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    23  // THE SOFTWARE.
    24  
    25  package ringpop
    26  
    27  import (
    28  	"errors"
    29  	"net"
    30  	"strconv"
    31  	"sync"
    32  	"sync/atomic"
    33  	"time"
    34  
    35  	"github.com/temporalio/ringpop-go"
    36  	"github.com/temporalio/tchannel-go"
    37  
    38  	"github.com/dgryski/go-farm"
    39  	"github.com/temporalio/ringpop-go/events"
    40  	"github.com/temporalio/ringpop-go/hashring"
    41  	"github.com/temporalio/ringpop-go/swim"
    42  
    43  	"go.temporal.io/server/common"
    44  	"go.temporal.io/server/common/log"
    45  	"go.temporal.io/server/common/log/tag"
    46  	"go.temporal.io/server/common/membership"
    47  	"go.temporal.io/server/common/primitives"
    48  	"go.temporal.io/server/common/util"
    49  )
    50  
const (
	// roleKey label is set by every single service as soon as it bootstraps its
	// ringpop instance. The data for this key is the service name
	roleKey = "serviceName"

	// rolePort label is set by every single service as soon as it bootstraps its
	// ringpop instance. The data for this key represents the TCP port through which
	// the service can be accessed.
	rolePort = "servicePort"

	// minRefreshInternal is the throttle window used by refreshWithBackoff: the
	// refresh is skipped when lastRefreshTime is more recent than now minus
	// this duration.
	minRefreshInternal     = time.Second * 4
	// defaultRefreshInterval is the period of the ticker that drives the
	// periodic refresh in refreshRingWorker.
	defaultRefreshInterval = time.Second * 10
	// replicaPoints is the replica point count handed to hashring.New when a
	// ring is created.
	replicaPoints          = 100
)
    65  
// serviceResolver is the ringpop-backed membership.ServiceResolver for a
// single service. It keeps a hashring built from the ringpop members that
// carry this service's roleKey label, refreshes it on ringpop ring-change
// events, on request and on a timer, and notifies registered listeners when
// the set of members changes.
type serviceResolver struct {
	// service is the value matched against the roleKey label of ringpop members.
	service     primitives.ServiceName
	// port is used for members that do not advertise a rolePort label.
	port        int
	rp          *ringpop.Ringpop
	// refreshChan is signalled by RequestRefresh and consumed by refreshRingWorker.
	refreshChan chan struct{}
	// shutdownCh is closed by Stop to make refreshRingWorker return.
	shutdownCh  chan struct{}
	// shutdownWG tracks the refreshRingWorker goroutine so Stop can wait for it.
	shutdownWG  sync.WaitGroup
	logger      log.Logger

	ringValue atomic.Value // this stores the current hashring

	// refreshLock is held around refreshNoLock, which writes lastRefreshTime
	// and membersMap.
	refreshLock     sync.Mutex
	// lastRefreshTime is compared against minRefreshInternal by
	// refreshWithBackoff to throttle refreshes.
	lastRefreshTime time.Time
	membersMap      map[string]struct{} // for de-duping change notifications

	// listenerLock guards listeners.
	listenerLock sync.RWMutex
	// listeners maps a listener name to the channel that receives change events.
	listeners    map[string]chan<- *membership.ChangedEvent
}

// Compile-time check that *serviceResolver implements membership.ServiceResolver.
var _ membership.ServiceResolver = (*serviceResolver)(nil)
    86  
    87  func newServiceResolver(
    88  	service primitives.ServiceName,
    89  	port int,
    90  	rp *ringpop.Ringpop,
    91  	logger log.Logger,
    92  ) *serviceResolver {
    93  	resolver := &serviceResolver{
    94  		service:     service,
    95  		port:        port,
    96  		rp:          rp,
    97  		refreshChan: make(chan struct{}),
    98  		shutdownCh:  make(chan struct{}),
    99  		logger:      log.With(logger, tag.ComponentServiceResolver, tag.Service(service)),
   100  		membersMap:  make(map[string]struct{}),
   101  		listeners:   make(map[string]chan<- *membership.ChangedEvent),
   102  	}
   103  	resolver.ringValue.Store(newHashRing())
   104  	return resolver
   105  }
   106  
   107  func newHashRing() *hashring.HashRing {
   108  	return hashring.New(farm.Fingerprint32, replicaPoints)
   109  }
   110  
   111  // Start starts the oracle
   112  func (r *serviceResolver) Start() {
   113  	r.rp.AddListener(r)
   114  	if err := r.refresh(); err != nil {
   115  		r.logger.Fatal("unable to start ring pop service resolver", tag.Error(err))
   116  	}
   117  
   118  	r.shutdownWG.Add(1)
   119  	go r.refreshRingWorker()
   120  }
   121  
   122  // Stop stops the resolver
   123  func (r *serviceResolver) Stop() {
   124  	r.listenerLock.Lock()
   125  	defer r.listenerLock.Unlock()
   126  	r.rp.RemoveListener(r)
   127  	r.ringValue.Store(newHashRing())
   128  	r.listeners = make(map[string]chan<- *membership.ChangedEvent)
   129  	close(r.shutdownCh)
   130  
   131  	if success := common.AwaitWaitGroup(&r.shutdownWG, time.Minute); !success {
   132  		r.logger.Warn("service resolver timed out on shutdown.")
   133  	}
   134  }
   135  
   136  func (r *serviceResolver) RequestRefresh() {
   137  	select {
   138  	case r.refreshChan <- struct{}{}:
   139  	default:
   140  	}
   141  }
   142  
   143  // Lookup finds the host in the ring responsible for serving the given key
   144  func (r *serviceResolver) Lookup(key string) (membership.HostInfo, error) {
   145  	addr, found := r.ring().Lookup(key)
   146  	if !found {
   147  		r.RequestRefresh()
   148  		return nil, membership.ErrInsufficientHosts
   149  	}
   150  
   151  	return newHostInfo(addr, r.getLabelsMap()), nil
   152  }
   153  
   154  func (r *serviceResolver) LookupN(key string, n int) []membership.HostInfo {
   155  	if n <= 0 {
   156  		return nil
   157  	}
   158  	addresses := r.ring().LookupN(key, n)
   159  	if len(addresses) == 0 {
   160  		r.RequestRefresh()
   161  		return nil
   162  	}
   163  	labels := r.getLabelsMap()
   164  	return util.MapSlice(addresses, func(address string) membership.HostInfo { return newHostInfo(address, labels) })
   165  }
   166  
   167  func (r *serviceResolver) AddListener(
   168  	name string,
   169  	notifyChannel chan<- *membership.ChangedEvent,
   170  ) error {
   171  	r.listenerLock.Lock()
   172  	defer r.listenerLock.Unlock()
   173  	_, ok := r.listeners[name]
   174  	if ok {
   175  		return membership.ErrListenerAlreadyExist
   176  	}
   177  	r.listeners[name] = notifyChannel
   178  	return nil
   179  }
   180  
   181  func (r *serviceResolver) RemoveListener(
   182  	name string,
   183  ) error {
   184  	r.listenerLock.Lock()
   185  	defer r.listenerLock.Unlock()
   186  	_, ok := r.listeners[name]
   187  	if !ok {
   188  		return nil
   189  	}
   190  	delete(r.listeners, name)
   191  	return nil
   192  }
   193  
   194  func (r *serviceResolver) MemberCount() int {
   195  	return r.ring().ServerCount()
   196  }
   197  
   198  func (r *serviceResolver) Members() []membership.HostInfo {
   199  	var servers []membership.HostInfo
   200  	for _, s := range r.ring().Servers() {
   201  		servers = append(servers, newHostInfo(s, r.getLabelsMap()))
   202  	}
   203  
   204  	return servers
   205  }
   206  
   207  // HandleEvent handles updates from ringpop
   208  func (r *serviceResolver) HandleEvent(
   209  	event events.Event,
   210  ) {
   211  	// We only care about RingChangedEvent
   212  	if _, ok := event.(events.RingChangedEvent); ok {
   213  		r.logger.Debug("Received a ring changed event")
   214  		// Note that we receive events asynchronously, possibly out of order.
   215  		// We cannot rely on the content of the event, rather we load everything
   216  		// from ringpop when we get a notification that something changed.
   217  		if err := r.refresh(); err != nil {
   218  			r.logger.Error("error refreshing ring when receiving a ring changed event", tag.Error(err))
   219  		}
   220  	}
   221  }
   222  
   223  func (r *serviceResolver) refresh() error {
   224  	var event *membership.ChangedEvent
   225  	var err error
   226  	defer func() {
   227  		if event != nil {
   228  			r.emitEvent(event)
   229  		}
   230  	}()
   231  	r.refreshLock.Lock()
   232  	defer r.refreshLock.Unlock()
   233  	event, err = r.refreshNoLock()
   234  	return err
   235  }
   236  
   237  func (r *serviceResolver) refreshWithBackoff() error {
   238  	var event *membership.ChangedEvent
   239  	var err error
   240  	defer func() {
   241  		if event != nil {
   242  			r.emitEvent(event)
   243  		}
   244  	}()
   245  	r.refreshLock.Lock()
   246  	defer r.refreshLock.Unlock()
   247  	if r.lastRefreshTime.After(time.Now().UTC().Add(-minRefreshInternal)) {
   248  		// refresh too frequently
   249  		return nil
   250  	}
   251  	event, err = r.refreshNoLock()
   252  	return err
   253  }
   254  
   255  func (r *serviceResolver) refreshNoLock() (*membership.ChangedEvent, error) {
   256  	addrs, err := r.getReachableMembers()
   257  	if err != nil {
   258  		return nil, err
   259  	}
   260  
   261  	newMembersMap, changedEvent := r.compareMembers(addrs)
   262  	if changedEvent == nil {
   263  		return nil, nil
   264  	}
   265  
   266  	ring := newHashRing()
   267  	for _, addr := range addrs {
   268  		host := newHostInfo(addr, r.getLabelsMap())
   269  		ring.AddMembers(host)
   270  	}
   271  
   272  	r.membersMap = newMembersMap
   273  	r.lastRefreshTime = time.Now().UTC()
   274  	r.ringValue.Store(ring)
   275  	r.logger.Info("Current reachable members", tag.Addresses(addrs))
   276  
   277  	return changedEvent, nil
   278  }
   279  
   280  func (r *serviceResolver) getReachableMembers() ([]string, error) {
   281  	members, err := r.rp.GetReachableMemberObjects(swim.MemberWithLabelAndValue(roleKey, string(r.service)))
   282  	if err != nil {
   283  		return nil, err
   284  	}
   285  
   286  	var hostPorts []string
   287  	for _, member := range members {
   288  		servicePort := r.port
   289  
   290  		// Each temporal service in the ring should advertise which port it has its gRPC listener
   291  		// on via a service label. If we cannot find the label, we will assume that the
   292  		// temporal service is listening on the same port that this node is listening on.
   293  		servicePortLabel, ok := member.Label(rolePort)
   294  		if ok {
   295  			servicePort, err = strconv.Atoi(servicePortLabel)
   296  			if err != nil {
   297  				return nil, err
   298  			}
   299  		} else {
   300  			r.logger.Debug("unable to find roleport label for ringpop member. using local service's port", tag.Service(r.service))
   301  		}
   302  
   303  		hostPort, err := replaceServicePort(member.Address, servicePort)
   304  		if err != nil {
   305  			return nil, err
   306  		}
   307  
   308  		hostPorts = append(hostPorts, hostPort)
   309  	}
   310  
   311  	return hostPorts, nil
   312  }
   313  
   314  func (r *serviceResolver) emitEvent(event *membership.ChangedEvent) {
   315  	// Notify listeners
   316  	r.listenerLock.RLock()
   317  	defer r.listenerLock.RUnlock()
   318  
   319  	for name, ch := range r.listeners {
   320  		select {
   321  		case ch <- event:
   322  		default:
   323  			r.logger.Error("Failed to send listener notification, channel full", tag.ListenerName(name))
   324  		}
   325  	}
   326  }
   327  
   328  func (r *serviceResolver) refreshRingWorker() {
   329  	defer r.shutdownWG.Done()
   330  
   331  	refreshTicker := time.NewTicker(defaultRefreshInterval)
   332  	defer refreshTicker.Stop()
   333  
   334  	for {
   335  		select {
   336  		case <-r.shutdownCh:
   337  			return
   338  		case <-r.refreshChan:
   339  			if err := r.refreshWithBackoff(); err != nil {
   340  				r.logger.Error("error refreshing ring by request", tag.Error(err))
   341  			}
   342  		case <-refreshTicker.C:
   343  			if err := r.refreshWithBackoff(); err != nil {
   344  				r.logger.Error("error periodically refreshing ring", tag.Error(err))
   345  			}
   346  		}
   347  	}
   348  }
   349  
   350  func (r *serviceResolver) ring() *hashring.HashRing {
   351  	return r.ringValue.Load().(*hashring.HashRing)
   352  }
   353  
   354  func (r *serviceResolver) getLabelsMap() map[string]string {
   355  	labels := make(map[string]string)
   356  	labels[roleKey] = string(r.service)
   357  	return labels
   358  }
   359  
   360  func (r *serviceResolver) compareMembers(addrs []string) (map[string]struct{}, *membership.ChangedEvent) {
   361  	event := &membership.ChangedEvent{}
   362  	changed := false
   363  	newMembersMap := make(map[string]struct{}, len(addrs))
   364  	for _, addr := range addrs {
   365  		newMembersMap[addr] = struct{}{}
   366  		if _, ok := r.membersMap[addr]; !ok {
   367  			event.HostsAdded = append(event.HostsAdded, newHostInfo(addr, r.getLabelsMap()))
   368  			changed = true
   369  		}
   370  	}
   371  	for addr := range r.membersMap {
   372  		if _, ok := newMembersMap[addr]; !ok {
   373  			event.HostsRemoved = append(event.HostsRemoved, newHostInfo(addr, r.getLabelsMap()))
   374  			changed = true
   375  		}
   376  	}
   377  	if changed {
   378  		return newMembersMap, event
   379  	}
   380  	return newMembersMap, nil
   381  }
   382  
   383  // buildBroadcastHostPort return the listener hostport from an existing tchannel
   384  // and overrides the address with broadcastAddress if specified
   385  func buildBroadcastHostPort(listenerPeerInfo tchannel.LocalPeerInfo, broadcastAddress string) (string, error) {
   386  	// Ephemeral port check copied from ringpop-go/ringpop.go/channelAddressResolver
   387  	// Check that TChannel is listening on a real hostport. By default,
   388  	// TChannel listens on an ephemeral host/port. The real port is then
   389  	// assigned by the OS when ListenAndServe is called. If the hostport is
   390  	// ephemeral, it means TChannel is not yet listening and the hostport
   391  	// cannot be resolved.
   392  	if listenerPeerInfo.IsEphemeralHostPort() {
   393  		return "", ringpop.ErrEphemeralAddress
   394  	}
   395  
   396  	// Parse listener hostport
   397  	listenerIPString, port, err := net.SplitHostPort(listenerPeerInfo.HostPort)
   398  	if err != nil {
   399  		return "", err
   400  	}
   401  
   402  	// Broadcast IP override
   403  	if broadcastAddress != "" {
   404  		// Parse supplied broadcastAddress override
   405  		ip := net.ParseIP(broadcastAddress)
   406  		if ip == nil {
   407  			return "", errors.New("broadcastAddress set but unknown failure encountered while parsing")
   408  		}
   409  
   410  		// If no errors, use the parsed IP with the port from our listener
   411  		return net.JoinHostPort(ip.String(), port), nil
   412  	}
   413  
   414  	listenerIP := net.ParseIP(listenerIPString)
   415  	if listenerIP == nil {
   416  		return "", errors.New("unable to parse listenerIP")
   417  	}
   418  
   419  	if listenerIP.IsUnspecified() {
   420  		return "", errors.New("broadcastAddress required when listening on all interfaces (0.0.0.0/[::])")
   421  	}
   422  
   423  	return listenerPeerInfo.HostPort, nil
   424  }