github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/pubsub/remoteserver.go (about)

     1  // Copyright 2016 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package pubsub
     5  
     6  import (
     7  	"fmt"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/juju/clock"
    12  	"github.com/juju/collections/deque"
    13  	"github.com/juju/errors"
    14  	"github.com/juju/pubsub/v2"
    15  	"github.com/juju/retry"
    16  	"github.com/juju/worker/v3"
    17  	"gopkg.in/tomb.v2"
    18  
    19  	"github.com/juju/juju/api"
    20  	"github.com/juju/juju/pubsub/forwarder"
    21  	"github.com/juju/juju/rpc/params"
    22  )
    23  
// RemoteServer represents the public interface of the worker
// responsible for forwarding messages to a single other API server.
type RemoteServer interface {
	worker.Worker
	Reporter
	// UpdateAddresses replaces the addresses used to reach the target
	// API server.
	UpdateAddresses(addresses []string)
	// Publish hands a message to the worker for forwarding to the target
	// API server.
	Publish(message *params.PubSubMessage)
}
    32  
// remoteServer is responsible for taking messages and sending them to the
// pubsub endpoint on the remote server. If the connection is dropped, the
// remoteServer will try to reconnect. Messages are not sent until the
// connection either succeeds the first time, or fails to connect. Once there
// is a failure, incoming messages are dropped until reconnection is complete,
// then messages will flow again.
type remoteServer struct {
	// origin and target name this API server and the remote one; they are
	// matched against forwarder.OriginTarget values and used in log output.
	origin string
	target string
	// info is handed to newWriter when connecting; its Addrs field is
	// replaced by UpdateAddresses.
	info   *api.Info
	logger Logger

	// newWriter opens a MessageWriter for the given connection info.
	newWriter func(*api.Info) (MessageWriter, error)
	// connection is the current writer, or nil while disconnected.
	connection MessageWriter

	// hub is used to subscribe to, and publish, forwarder connection topics.
	hub *pubsub.StructuredHub
	// tomb tracks the lifetime of the main loop.
	tomb  tomb.Tomb
	clock clock.Clock
	// mutex is held by the methods below when they touch connection,
	// pending, sent, stopConnecting and info.Addrs.
	mutex sync.Mutex

	// pending queues *params.PubSubMessage values waiting to be forwarded.
	pending *deque.Deque
	// data is signalled by Publish when a message lands in an empty queue.
	data chan struct{}
	// stopConnecting is non-nil only while connect is retrying; closing it
	// aborts the retry loop.
	stopConnecting chan struct{}
	// connectionReset is created on each successful connect and closed by
	// resetConnection.
	connectionReset chan struct{}
	// sent counts the messages taken off the queue for sending.
	sent uint64

	// unsubscribe removes the hub subscription made in NewRemoteServer; it
	// is called when the main loop exits.
	unsubscribe func()
}
    61  
// RemoteServerConfig defines all the attributes that are needed for a RemoteServer.
type RemoteServerConfig struct {
	// Hub is used to publish connection messages
	Hub *pubsub.StructuredHub
	// Origin and Target name this API server and the remote API server
	// that messages are forwarded to.
	Origin string
	Target string
	// Clock drives the reconnect retry and the delay between attempts.
	Clock clock.Clock
	// Logger receives the worker's log output.
	Logger Logger

	// APIInfo is initially populated with the addresses of the target machine.
	APIInfo *api.Info
	// NewWriter is called with APIInfo to open the MessageWriter used to
	// forward messages.
	NewWriter func(*api.Info) (MessageWriter, error)
}
    75  
    76  // NewRemoteServer creates a new RemoteServer that will connect to the remote
    77  // apiserver and pass on messages to the pubsub endpoint of that apiserver.
    78  func NewRemoteServer(config RemoteServerConfig) (RemoteServer, error) {
    79  	remote := &remoteServer{
    80  		origin:    config.Origin,
    81  		target:    config.Target,
    82  		info:      config.APIInfo,
    83  		logger:    config.Logger,
    84  		newWriter: config.NewWriter,
    85  		hub:       config.Hub,
    86  		clock:     config.Clock,
    87  		pending:   deque.New(),
    88  		data:      make(chan struct{}),
    89  	}
    90  	unsub, err := remote.hub.Subscribe(forwarder.ConnectedTopic, remote.onForwarderConnection)
    91  	if err != nil {
    92  		return nil, errors.Trace(err)
    93  	}
    94  	remote.unsubscribe = unsub
    95  	remote.tomb.Go(remote.loop)
    96  	return remote, nil
    97  }
    98  
    99  // Report provides information to the engine report.
   100  // It should be fast and minimally blocking.
   101  func (r *remoteServer) Report() map[string]interface{} {
   102  	r.mutex.Lock()
   103  	defer r.mutex.Unlock()
   104  
   105  	var status string
   106  	if r.connection == nil {
   107  		status = "disconnected"
   108  	} else {
   109  		status = "connected"
   110  	}
   111  	result := map[string]interface{}{
   112  		"status": status,
   113  		"sent":   r.sent,
   114  	}
   115  	if r.info != nil {
   116  		result["addresses"] = r.info.Addrs
   117  	}
   118  	if r.pending != nil {
   119  		result["queue-len"] = r.pending.Len()
   120  	}
   121  	return result
   122  }
   123  
   124  // IntrospectionReport is the method called by the subscriber to get
   125  // information about this server.
   126  func (r *remoteServer) IntrospectionReport() string {
   127  	r.mutex.Lock()
   128  	defer r.mutex.Unlock()
   129  
   130  	var status string
   131  	if r.connection == nil {
   132  		status = "disconnected"
   133  	} else {
   134  		status = "connected"
   135  	}
   136  	return fmt.Sprintf(""+
   137  		"  Status: %s\n"+
   138  		"  Addresses: %v\n"+
   139  		"  Queue length: %d\n"+
   140  		"  Sent count: %d\n",
   141  		status, r.info.Addrs, r.pending.Len(), r.sent)
   142  }
   143  
   144  func (r *remoteServer) onForwarderConnection(topic string, details forwarder.OriginTarget, err error) {
   145  	if err != nil {
   146  		// This should never happen.
   147  		r.logger.Errorf("subscriber callback error: %v", err)
   148  		return
   149  	}
   150  	if details.Target == r.origin && details.Origin == r.target {
   151  		// If we have just been connected to by the apiserver that we are
   152  		// trying to connect to, interrupt any waiting we may be doing and try
   153  		// again as we may be in the middle of a long wait.
   154  		r.interruptConnecting()
   155  	}
   156  }
   157  
   158  // UpdateAddresses will update the addresses held for the target API server.
   159  // If we are currently trying to connect to the target, interrupt it so we
   160  // can try again with the new addresses.
   161  func (r *remoteServer) UpdateAddresses(addresses []string) {
   162  	r.mutex.Lock()
   163  	defer r.mutex.Unlock()
   164  
   165  	if r.connection == nil && r.stopConnecting != nil {
   166  		// We are probably trying to reconnect, so interrupt that so we don't
   167  		// get a race between setting addresses and trying to read them to
   168  		// connect. Note that we don't call the interruptConnecting method
   169  		// here because that method also tries to lock the mutex.
   170  		r.logger.Debugf("interrupting connecting due to new addresses: %v", addresses)
   171  		close(r.stopConnecting)
   172  		r.stopConnecting = nil
   173  	}
   174  	r.info.Addrs = addresses
   175  }
   176  
   177  // Publish queues up the message if and only if we have an active connection to
   178  // the target apiserver.
   179  func (r *remoteServer) Publish(message *params.PubSubMessage) {
   180  	select {
   181  	case <-r.tomb.Dying():
   182  		r.logger.Tracef("dying, don't send %q", message.Topic)
   183  	default:
   184  		r.mutex.Lock()
   185  		// Only queue the message up if we are currently connected.
   186  		notifyData := false
   187  		if r.connection != nil {
   188  			r.logger.Tracef("queue up topic %q", message.Topic)
   189  			r.pending.PushBack(message)
   190  			notifyData = r.pending.Len() == 1
   191  
   192  		} else {
   193  			r.logger.Tracef("skipping %q for %s as not connected", message.Topic, r.target)
   194  		}
   195  		r.mutex.Unlock()
   196  		if notifyData {
   197  			select {
   198  			case r.data <- struct{}{}:
   199  			case <-r.connectionReset:
   200  				r.logger.Debugf("connection reset while notifying %q for %s", message.Topic, r.target)
   201  			}
   202  		}
   203  	}
   204  }
   205  
   206  // nextMessage returns the next queued message, and a flag to indicate empty.
   207  func (r *remoteServer) nextMessage() *params.PubSubMessage {
   208  	r.mutex.Lock()
   209  	defer r.mutex.Unlock()
   210  	val, ok := r.pending.PopFront()
   211  	if !ok {
   212  		// nothing to do
   213  		return nil
   214  	}
   215  	// Even though it isn't exactly sent right now, it effectively will
   216  	// be very soon, and we want to keep this counter in the mutex lock.
   217  	r.sent++
   218  	return val.(*params.PubSubMessage)
   219  }
   220  
   221  func (r *remoteServer) connect() bool {
   222  	stop := make(chan struct{})
   223  	r.mutex.Lock()
   224  	r.stopConnecting = stop
   225  	r.mutex.Unlock()
   226  
   227  	var connection MessageWriter
   228  	r.logger.Debugf("connecting to %s", r.target)
   229  	_ = retry.Call(retry.CallArgs{
   230  		Func: func() error {
   231  			r.logger.Debugf("open api to %s: %v", r.target, r.info.Addrs)
   232  			conn, err := r.newWriter(r.info)
   233  			if err != nil {
   234  				r.logger.Tracef("unable to get message writer for %s, reconnecting... : %v\n%s", r.target, err, errors.ErrorStack(err))
   235  				return errors.Trace(err)
   236  			}
   237  			connection = conn
   238  			return nil
   239  		},
   240  		Attempts:    retry.UnlimitedAttempts,
   241  		Delay:       time.Second,
   242  		MaxDelay:    5 * time.Minute,
   243  		BackoffFunc: retry.DoubleDelay,
   244  		Stop:        stop,
   245  		Clock:       r.clock,
   246  	})
   247  
   248  	r.mutex.Lock()
   249  	r.stopConnecting = nil
   250  	defer r.mutex.Unlock()
   251  
   252  	if connection != nil {
   253  		r.connection = connection
   254  		r.connectionReset = make(chan struct{})
   255  		r.logger.Infof("forwarding connected %s -> %s", r.origin, r.target)
   256  		_, err := r.hub.Publish(
   257  			forwarder.ConnectedTopic,
   258  			// NOTE: origin is filled in by the the central hub annotations.
   259  			forwarder.OriginTarget{Target: r.target})
   260  		if err != nil {
   261  			r.logger.Errorf("%v", err)
   262  		}
   263  		return true
   264  	}
   265  	return false
   266  }
   267  
   268  func (r *remoteServer) loop() error {
   269  	defer r.unsubscribe()
   270  
   271  	var delay <-chan time.Time
   272  	messageToSend := make(chan *params.PubSubMessage)
   273  	messageSent := make(chan *params.PubSubMessage)
   274  	go r.forwardMessages(messageToSend, messageSent)
   275  
   276  	for {
   277  		if r.connection == nil {
   278  			// If we don't have a current connection, try to get one.
   279  			if r.connect() {
   280  				delay = nil
   281  			} else {
   282  				// Skip through the select to try to reconnect.
   283  				delay = r.clock.After(time.Second)
   284  			}
   285  		}
   286  
   287  		select {
   288  		case <-r.tomb.Dying():
   289  			r.logger.Debugf("worker shutting down")
   290  			r.resetConnection()
   291  			return tomb.ErrDying
   292  		case <-r.data:
   293  			// Has new data been pushed on?
   294  			r.logger.Tracef("new messages")
   295  		case <-delay:
   296  			// If we failed to connect for whatever reason, this means we don't cycle
   297  			// immediately.
   298  			r.logger.Tracef("connect delay")
   299  		}
   300  		r.logger.Tracef("send pending messages")
   301  		r.sendPendingMessages(messageToSend, messageSent)
   302  	}
   303  }
   304  
   305  func (r *remoteServer) sendPendingMessages(messageToSend chan<- *params.PubSubMessage, messageSent <-chan *params.PubSubMessage) {
   306  	for message := r.nextMessage(); message != nil; message = r.nextMessage() {
   307  		select {
   308  		case <-r.tomb.Dying():
   309  			return
   310  		case messageToSend <- message:
   311  			// Just in case the worker dies while we are trying to send.
   312  		}
   313  		select {
   314  		case <-r.tomb.Dying():
   315  			// This will cause the main loop to iterate around, and close
   316  			// the connection before returning.
   317  			return
   318  		case <-messageSent:
   319  			// continue on to next
   320  		}
   321  	}
   322  }
   323  
   324  func (r *remoteServer) resetConnection() {
   325  	r.mutex.Lock()
   326  	defer r.mutex.Unlock()
   327  	// If we have already been reset, just return
   328  	if r.connection == nil {
   329  		return
   330  	}
   331  	r.logger.Debugf("closing connection and clearing pending")
   332  	r.connection.Close()
   333  	r.connection = nil
   334  	close(r.connectionReset)
   335  	// Discard all pending messages.
   336  	r.pending = deque.New()
   337  	// Tell everyone what we have been disconnected.
   338  	_, err := r.hub.Publish(
   339  		forwarder.DisconnectedTopic,
   340  		// NOTE: origin is filled in by the the central hub annotations.
   341  		forwarder.OriginTarget{Target: r.target})
   342  	if err != nil {
   343  		r.logger.Errorf("%v", err)
   344  	}
   345  }
   346  
   347  // forwardMessages is a goroutine whose sole purpose is to get messages off
   348  // the messageToSend channel, try to send them over the API, and say when they
   349  // are done with this message. This allows for the potential blocking call of
   350  // `ForwardMessage`. If this does block for whatever reason and the worker is
   351  // asked to shutdown, the main loop method is able to do so. That would cause
   352  // the API connection to be closed, which would cause the `ForwardMessage` to
   353  // be unblocked due to the error of the socket closing.
   354  func (r *remoteServer) forwardMessages(messageToSend <-chan *params.PubSubMessage, messageSent chan<- *params.PubSubMessage) {
   355  	var message *params.PubSubMessage
   356  	for {
   357  		select {
   358  		case <-r.tomb.Dying():
   359  			return
   360  		case message = <-messageToSend:
   361  		}
   362  		r.mutex.Lock()
   363  		conn := r.connection
   364  		r.mutex.Unlock()
   365  
   366  		r.logger.Tracef("forwarding %q to %s, data %v", message.Topic, r.target, message.Data)
   367  		if conn != nil {
   368  			err := conn.ForwardMessage(message)
   369  			if err != nil {
   370  				// Some problem sending, so log, close the connection, and try to reconnect.
   371  				r.logger.Infof("unable to forward message, reconnecting... : %v", err)
   372  				r.resetConnection()
   373  			}
   374  		}
   375  
   376  		select {
   377  		case <-r.tomb.Dying():
   378  			return
   379  		case messageSent <- message:
   380  		}
   381  	}
   382  }
   383  
   384  func (r *remoteServer) interruptConnecting() {
   385  	r.mutex.Lock()
   386  	defer r.mutex.Unlock()
   387  	if r.stopConnecting != nil {
   388  		r.logger.Debugf("interrupting the pending connect loop")
   389  		close(r.stopConnecting)
   390  		r.stopConnecting = nil
   391  	}
   392  }
   393  
// Kill is part of the worker.Worker interface.
// It marks the tomb as dying and then interrupts any connect attempt that
// is in progress, so that the main loop is not left waiting in a retry.
func (r *remoteServer) Kill() {
	r.tomb.Kill(nil)
	r.interruptConnecting()
}
   399  
   400  // Wait is part of the worker.Worker interface.
   401  func (r *remoteServer) Wait() error {
   402  	return r.tomb.Wait()
   403  }