github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/pubsub/remoteserver.go (about) 1 // Copyright 2016 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package pubsub 5 6 import ( 7 "fmt" 8 "sync" 9 "time" 10 11 "github.com/juju/clock" 12 "github.com/juju/collections/deque" 13 "github.com/juju/errors" 14 "github.com/juju/pubsub/v2" 15 "github.com/juju/retry" 16 "github.com/juju/worker/v3" 17 "gopkg.in/tomb.v2" 18 19 "github.com/juju/juju/api" 20 "github.com/juju/juju/pubsub/forwarder" 21 "github.com/juju/juju/rpc/params" 22 ) 23 24 // RemoteServer represents the public interface of the worker 25 // responsible for forwarding messages to a single other API server. 26 type RemoteServer interface { 27 worker.Worker 28 Reporter 29 UpdateAddresses(addresses []string) 30 Publish(message *params.PubSubMessage) 31 } 32 33 // remoteServer is responsible for taking messages and sending them to the 34 // pubsub endpoint on the remote server. If the connection is dropped, the 35 // remoteServer will try to reconnect. Messages are not sent until the 36 // connection either succeeds the first time, or fails to connect. Once there 37 // is a failure, incoming messages are dropped until reconnection is complete, 38 // then messages will flow again. 39 type remoteServer struct { 40 origin string 41 target string 42 info *api.Info 43 logger Logger 44 45 newWriter func(*api.Info) (MessageWriter, error) 46 connection MessageWriter 47 48 hub *pubsub.StructuredHub 49 tomb tomb.Tomb 50 clock clock.Clock 51 mutex sync.Mutex 52 53 pending *deque.Deque 54 data chan struct{} 55 stopConnecting chan struct{} 56 connectionReset chan struct{} 57 sent uint64 58 59 unsubscribe func() 60 } 61 62 // RemoteServerConfig defines all the attributes that are needed for a RemoteServer. 63 type RemoteServerConfig struct { 64 // Hub is used to publish connection messages 65 Hub *pubsub.StructuredHub 66 Origin string 67 Target string 68 Clock clock.Clock 69 Logger Logger 70 71 // APIInfo is initially populated with the addresses of the target machine. 72 APIInfo *api.Info 73 NewWriter func(*api.Info) (MessageWriter, error) 74 } 75 76 // NewRemoteServer creates a new RemoteServer that will connect to the remote 77 // apiserver and pass on messages to the pubsub endpoint of that apiserver. 78 func NewRemoteServer(config RemoteServerConfig) (RemoteServer, error) { 79 remote := &remoteServer{ 80 origin: config.Origin, 81 target: config.Target, 82 info: config.APIInfo, 83 logger: config.Logger, 84 newWriter: config.NewWriter, 85 hub: config.Hub, 86 clock: config.Clock, 87 pending: deque.New(), 88 data: make(chan struct{}), 89 } 90 unsub, err := remote.hub.Subscribe(forwarder.ConnectedTopic, remote.onForwarderConnection) 91 if err != nil { 92 return nil, errors.Trace(err) 93 } 94 remote.unsubscribe = unsub 95 remote.tomb.Go(remote.loop) 96 return remote, nil 97 } 98 99 // Report provides information to the engine report. 100 // It should be fast and minimally blocking. 101 func (r *remoteServer) Report() map[string]interface{} { 102 r.mutex.Lock() 103 defer r.mutex.Unlock() 104 105 var status string 106 if r.connection == nil { 107 status = "disconnected" 108 } else { 109 status = "connected" 110 } 111 result := map[string]interface{}{ 112 "status": status, 113 "sent": r.sent, 114 } 115 if r.info != nil { 116 result["addresses"] = r.info.Addrs 117 } 118 if r.pending != nil { 119 result["queue-len"] = r.pending.Len() 120 } 121 return result 122 } 123 124 // IntrospectionReport is the method called by the subscriber to get 125 // information about this server. 126 func (r *remoteServer) IntrospectionReport() string { 127 r.mutex.Lock() 128 defer r.mutex.Unlock() 129 130 var status string 131 if r.connection == nil { 132 status = "disconnected" 133 } else { 134 status = "connected" 135 } 136 return fmt.Sprintf(""+ 137 " Status: %s\n"+ 138 " Addresses: %v\n"+ 139 " Queue length: %d\n"+ 140 " Sent count: %d\n", 141 status, r.info.Addrs, r.pending.Len(), r.sent) 142 } 143 144 func (r *remoteServer) onForwarderConnection(topic string, details forwarder.OriginTarget, err error) { 145 if err != nil { 146 // This should never happen. 147 r.logger.Errorf("subscriber callback error: %v", err) 148 return 149 } 150 if details.Target == r.origin && details.Origin == r.target { 151 // If we have just been connected to by the apiserver that we are 152 // trying to connect to, interrupt any waiting we may be doing and try 153 // again as we may be in the middle of a long wait. 154 r.interruptConnecting() 155 } 156 } 157 158 // UpdateAddresses will update the addresses held for the target API server. 159 // If we are currently trying to connect to the target, interrupt it so we 160 // can try again with the new addresses. 161 func (r *remoteServer) UpdateAddresses(addresses []string) { 162 r.mutex.Lock() 163 defer r.mutex.Unlock() 164 165 if r.connection == nil && r.stopConnecting != nil { 166 // We are probably trying to reconnect, so interrupt that so we don't 167 // get a race between setting addresses and trying to read them to 168 // connect. Note that we don't call the interruptConnecting method 169 // here because that method also tries to lock the mutex. 170 r.logger.Debugf("interrupting connecting due to new addresses: %v", addresses) 171 close(r.stopConnecting) 172 r.stopConnecting = nil 173 } 174 r.info.Addrs = addresses 175 } 176 177 // Publish queues up the message if and only if we have an active connection to 178 // the target apiserver. 179 func (r *remoteServer) Publish(message *params.PubSubMessage) { 180 select { 181 case <-r.tomb.Dying(): 182 r.logger.Tracef("dying, don't send %q", message.Topic) 183 default: 184 r.mutex.Lock() 185 // Only queue the message up if we are currently connected. 186 notifyData := false 187 if r.connection != nil { 188 r.logger.Tracef("queue up topic %q", message.Topic) 189 r.pending.PushBack(message) 190 notifyData = r.pending.Len() == 1 191 192 } else { 193 r.logger.Tracef("skipping %q for %s as not connected", message.Topic, r.target) 194 } 195 r.mutex.Unlock() 196 if notifyData { 197 select { 198 case r.data <- struct{}{}: 199 case <-r.connectionReset: 200 r.logger.Debugf("connection reset while notifying %q for %s", message.Topic, r.target) 201 } 202 } 203 } 204 } 205 206 // nextMessage returns the next queued message, and a flag to indicate empty. 207 func (r *remoteServer) nextMessage() *params.PubSubMessage { 208 r.mutex.Lock() 209 defer r.mutex.Unlock() 210 val, ok := r.pending.PopFront() 211 if !ok { 212 // nothing to do 213 return nil 214 } 215 // Even though it isn't exactly sent right now, it effectively will 216 // be very soon, and we want to keep this counter in the mutex lock. 217 r.sent++ 218 return val.(*params.PubSubMessage) 219 } 220 221 func (r *remoteServer) connect() bool { 222 stop := make(chan struct{}) 223 r.mutex.Lock() 224 r.stopConnecting = stop 225 r.mutex.Unlock() 226 227 var connection MessageWriter 228 r.logger.Debugf("connecting to %s", r.target) 229 _ = retry.Call(retry.CallArgs{ 230 Func: func() error { 231 r.logger.Debugf("open api to %s: %v", r.target, r.info.Addrs) 232 conn, err := r.newWriter(r.info) 233 if err != nil { 234 r.logger.Tracef("unable to get message writer for %s, reconnecting... : %v\n%s", r.target, err, errors.ErrorStack(err)) 235 return errors.Trace(err) 236 } 237 connection = conn 238 return nil 239 }, 240 Attempts: retry.UnlimitedAttempts, 241 Delay: time.Second, 242 MaxDelay: 5 * time.Minute, 243 BackoffFunc: retry.DoubleDelay, 244 Stop: stop, 245 Clock: r.clock, 246 }) 247 248 r.mutex.Lock() 249 r.stopConnecting = nil 250 defer r.mutex.Unlock() 251 252 if connection != nil { 253 r.connection = connection 254 r.connectionReset = make(chan struct{}) 255 r.logger.Infof("forwarding connected %s -> %s", r.origin, r.target) 256 _, err := r.hub.Publish( 257 forwarder.ConnectedTopic, 258 // NOTE: origin is filled in by the the central hub annotations. 259 forwarder.OriginTarget{Target: r.target}) 260 if err != nil { 261 r.logger.Errorf("%v", err) 262 } 263 return true 264 } 265 return false 266 } 267 268 func (r *remoteServer) loop() error { 269 defer r.unsubscribe() 270 271 var delay <-chan time.Time 272 messageToSend := make(chan *params.PubSubMessage) 273 messageSent := make(chan *params.PubSubMessage) 274 go r.forwardMessages(messageToSend, messageSent) 275 276 for { 277 if r.connection == nil { 278 // If we don't have a current connection, try to get one. 279 if r.connect() { 280 delay = nil 281 } else { 282 // Skip through the select to try to reconnect. 283 delay = r.clock.After(time.Second) 284 } 285 } 286 287 select { 288 case <-r.tomb.Dying(): 289 r.logger.Debugf("worker shutting down") 290 r.resetConnection() 291 return tomb.ErrDying 292 case <-r.data: 293 // Has new data been pushed on? 294 r.logger.Tracef("new messages") 295 case <-delay: 296 // If we failed to connect for whatever reason, this means we don't cycle 297 // immediately. 298 r.logger.Tracef("connect delay") 299 } 300 r.logger.Tracef("send pending messages") 301 r.sendPendingMessages(messageToSend, messageSent) 302 } 303 } 304 305 func (r *remoteServer) sendPendingMessages(messageToSend chan<- *params.PubSubMessage, messageSent <-chan *params.PubSubMessage) { 306 for message := r.nextMessage(); message != nil; message = r.nextMessage() { 307 select { 308 case <-r.tomb.Dying(): 309 return 310 case messageToSend <- message: 311 // Just in case the worker dies while we are trying to send. 312 } 313 select { 314 case <-r.tomb.Dying(): 315 // This will cause the main loop to iterate around, and close 316 // the connection before returning. 317 return 318 case <-messageSent: 319 // continue on to next 320 } 321 } 322 } 323 324 func (r *remoteServer) resetConnection() { 325 r.mutex.Lock() 326 defer r.mutex.Unlock() 327 // If we have already been reset, just return 328 if r.connection == nil { 329 return 330 } 331 r.logger.Debugf("closing connection and clearing pending") 332 r.connection.Close() 333 r.connection = nil 334 close(r.connectionReset) 335 // Discard all pending messages. 336 r.pending = deque.New() 337 // Tell everyone what we have been disconnected. 338 _, err := r.hub.Publish( 339 forwarder.DisconnectedTopic, 340 // NOTE: origin is filled in by the the central hub annotations. 341 forwarder.OriginTarget{Target: r.target}) 342 if err != nil { 343 r.logger.Errorf("%v", err) 344 } 345 } 346 347 // forwardMessages is a goroutine whose sole purpose is to get messages off 348 // the messageToSend channel, try to send them over the API, and say when they 349 // are done with this message. This allows for the potential blocking call of 350 // `ForwardMessage`. If this does block for whatever reason and the worker is 351 // asked to shutdown, the main loop method is able to do so. That would cause 352 // the API connection to be closed, which would cause the `ForwardMessage` to 353 // be unblocked due to the error of the socket closing. 354 func (r *remoteServer) forwardMessages(messageToSend <-chan *params.PubSubMessage, messageSent chan<- *params.PubSubMessage) { 355 var message *params.PubSubMessage 356 for { 357 select { 358 case <-r.tomb.Dying(): 359 return 360 case message = <-messageToSend: 361 } 362 r.mutex.Lock() 363 conn := r.connection 364 r.mutex.Unlock() 365 366 r.logger.Tracef("forwarding %q to %s, data %v", message.Topic, r.target, message.Data) 367 if conn != nil { 368 err := conn.ForwardMessage(message) 369 if err != nil { 370 // Some problem sending, so log, close the connection, and try to reconnect. 371 r.logger.Infof("unable to forward message, reconnecting... : %v", err) 372 r.resetConnection() 373 } 374 } 375 376 select { 377 case <-r.tomb.Dying(): 378 return 379 case messageSent <- message: 380 } 381 } 382 } 383 384 func (r *remoteServer) interruptConnecting() { 385 r.mutex.Lock() 386 defer r.mutex.Unlock() 387 if r.stopConnecting != nil { 388 r.logger.Debugf("interrupting the pending connect loop") 389 close(r.stopConnecting) 390 r.stopConnecting = nil 391 } 392 } 393 394 // Kill is part of the worker.Worker interface. 395 func (r *remoteServer) Kill() { 396 r.tomb.Kill(nil) 397 r.interruptConnecting() 398 } 399 400 // Wait is part of the worker.Worker interface. 401 func (r *remoteServer) Wait() error { 402 return r.tomb.Wait() 403 }