github.com/cilium/cilium@v1.16.2/pkg/hubble/relay/pool/manager.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package pool 5 6 import ( 7 "context" 8 "fmt" 9 "sync" 10 "sync/atomic" 11 12 "github.com/prometheus/client_golang/prometheus" 13 "github.com/sirupsen/logrus" 14 "google.golang.org/grpc/connectivity" 15 16 peerpb "github.com/cilium/cilium/api/v1/peer" 17 peerTypes "github.com/cilium/cilium/pkg/hubble/peer/types" 18 poolTypes "github.com/cilium/cilium/pkg/hubble/relay/pool/types" 19 "github.com/cilium/cilium/pkg/inctimer" 20 "github.com/cilium/cilium/pkg/lock" 21 "github.com/cilium/cilium/pkg/time" 22 ) 23 24 type peer struct { 25 mu lock.Mutex 26 peerTypes.Peer 27 conn poolTypes.ClientConn 28 connAttempts int 29 nextConnAttempt time.Time 30 } 31 32 // PeerManager manages a pool of peers (Peer) and associated gRPC connections. 33 // Peers and peer change notifications are obtained from a peer gRPC service. 34 type PeerManager struct { 35 opts options 36 updated chan string 37 wg sync.WaitGroup 38 stop chan struct{} 39 peerServiceConnected atomic.Bool 40 mu lock.RWMutex 41 peers map[string]*peer 42 metrics *PoolMetrics 43 } 44 45 type Status struct { 46 PeerServiceConnected bool 47 AvailablePeers int 48 } 49 50 // NewPeerManager creates a new manager that connects to a peer gRPC service to 51 // manage peers and a connection to every peer's gRPC API. 52 func NewPeerManager(registry prometheus.Registerer, options ...Option) (*PeerManager, error) { 53 opts := defaultOptions 54 for _, opt := range options { 55 if err := opt(&opts); err != nil { 56 return nil, fmt.Errorf("failed to apply option: %w", err) 57 } 58 } 59 metrics := NewPoolMetrics(registry) 60 return &PeerManager{ 61 peers: make(map[string]*peer), 62 updated: make(chan string, 100), 63 stop: make(chan struct{}), 64 opts: opts, 65 metrics: metrics, 66 peerServiceConnected: atomic.Bool{}, 67 }, nil 68 } 69 70 // Start starts the manager. 
func (m *PeerManager) Start() {
	// Three long-running goroutines, all tracked by m.wg so that Stop()
	// can wait for a clean shutdown.
	m.wg.Add(3)
	go func() {
		defer m.wg.Done()
		m.watchNotifications()
	}()
	go func() {
		defer m.wg.Done()
		m.manageConnections()
	}()
	go func() {
		defer m.wg.Done()
		m.reportConnectionStatus()
	}()
}

// watchNotifications connects to the peer gRPC service and consumes peer
// change notifications until Stop() is called, applying each notification to
// the peer map via upsert/remove. On any failure (client creation, stream
// creation, or stream receive), it waits retryTimeout and restarts from
// scratch.
func (m *PeerManager) watchNotifications() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Propagate Stop() to the Notify stream by canceling its context.
	go func() {
		<-m.stop
		cancel()
	}()
	retryTimer, retryTimerDone := inctimer.New()
	defer retryTimerDone()
connect:
	for {
		cl, err := m.opts.peerClientBuilder.Client(m.opts.peerServiceAddress)
		if err != nil {
			m.opts.log.WithFields(logrus.Fields{
				"error":  err,
				"target": m.opts.peerServiceAddress,
			}).Warning("Failed to create peer client for peers synchronization; will try again after the timeout has expired")
			select {
			case <-m.stop:
				return
			case <-retryTimer.After(m.opts.retryTimeout):
				continue
			}
		}
		client, err := cl.Notify(ctx, &peerpb.NotifyRequest{})
		if err != nil {
			cl.Close()
			m.opts.log.WithFields(logrus.Fields{
				"error":              err,
				"connection timeout": m.opts.retryTimeout,
			}).Warning("Failed to create peer notify client for peers change notification; will try again after the timeout has expired")
			select {
			case <-m.stop:
				return
			case <-retryTimer.After(m.opts.retryTimeout):
				continue
			}
		}
		m.peerServiceConnected.Store(true)
		// Inner receive loop: process notifications until error or stop.
		for {
			// Non-blocking stop check before each Recv; Recv itself is
			// unblocked on stop via the context cancellation above.
			select {
			case <-m.stop:
				cl.Close()
				return
			default:
			}
			cn, err := client.Recv()
			if err != nil {
				cl.Close()
				m.opts.log.WithFields(logrus.Fields{
					"error":              err,
					"connection timeout": m.opts.retryTimeout,
				}).Warning("Error while receiving peer change notification; will try again after the timeout has expired")
				m.peerServiceConnected.Store(false)
				select {
				case <-m.stop:
					return
				case <-retryTimer.After(m.opts.retryTimeout):
					// Rebuild client and stream from scratch.
					continue connect
				}
			}
			m.opts.log.WithField("change notification", cn).Info("Received peer change notification")
			p := peerTypes.FromChangeNotification(cn)
			switch cn.GetType() {
			case peerpb.ChangeNotificationType_PEER_ADDED:
				m.upsert(p)
			case peerpb.ChangeNotificationType_PEER_DELETED:
				m.remove(p)
			case peerpb.ChangeNotificationType_PEER_UPDATED:
				m.upsert(p)
			}
		}
	}
}

// manageConnections drives peer connection attempts: immediately (ignoring
// backoff) for peers announced on the updated channel, and periodically
// (respecting backoff) for every known peer.
func (m *PeerManager) manageConnections() {
	connTimer, connTimerDone := inctimer.New()
	defer connTimerDone()
	for {
		select {
		case <-m.stop:
			return
		case name := <-m.updated:
			m.mu.RLock()
			// p may be nil if the peer was removed in the meantime;
			// connect() tolerates a nil peer.
			p := m.peers[name]
			m.mu.RUnlock()
			// Safe to Add here: Stop() cannot return from wg.Wait()
			// before this goroutine (registered in Start) exits.
			m.wg.Add(1)
			go func(p *peer) {
				defer m.wg.Done()
				// a connection request has been made, make sure to attempt a connection
				m.connect(p, true)
			}(p)
		case <-connTimer.After(m.opts.connCheckInterval):
			m.mu.RLock()
			for _, p := range m.peers {
				m.wg.Add(1)
				go func(p *peer) {
					defer m.wg.Done()
					m.connect(p, false)
				}(p)
			}
			m.mu.RUnlock()
		}
	}
}

// reportConnectionStatus periodically tallies per-peer gRPC connectivity
// states (plus the count of peers without any connection) and publishes the
// result to the pool metrics.
func (m *PeerManager) reportConnectionStatus() {
	connTimer, connTimerDone := inctimer.New()
	defer connTimerDone()
	for {
		select {
		case <-m.stop:
			return
		case <-connTimer.After(m.opts.connStatusInterval):
			m.mu.RLock()
			connStates := make(map[connectivity.State]uint32)
			var nilConnPeersNum uint32 = 0
			for _, p := range m.peers {
				p.mu.Lock()
				if p.conn == nil {
					nilConnPeersNum++
					p.mu.Unlock()
					continue
				}
				state := p.conn.GetState()
				connStates[state] = connStates[state] + 1
				p.mu.Unlock()
			}
			m.mu.RUnlock()
			m.metrics.ObservePeerConnectionStatus(connStates, nilConnPeersNum)
		}
	}
}

// Stop stops the manager.
func (m *PeerManager) Stop() {
	// Closing stop signals every goroutine started in Start (and the
	// connect goroutines they spawn); Wait blocks until all have exited.
	close(m.stop)
	m.wg.Wait()
}

// List implements observer.PeerLister.List.
func (m *PeerManager) List() []poolTypes.Peer {
	m.mu.RLock()
	defer m.mu.RUnlock()
	if len(m.peers) == 0 {
		return nil
	}
	// Snapshot each peer's identity and connection under its own lock so
	// the returned slice is safe to use without further synchronization.
	peers := make([]poolTypes.Peer, 0, len(m.peers))
	for _, v := range m.peers {
		// note: there shouldn't be null entries in the map
		v.mu.Lock()
		peers = append(peers, poolTypes.Peer{
			Peer: peerTypes.Peer{
				Name:          v.Name,
				Address:       v.Address,
				TLSEnabled:    v.TLSEnabled,
				TLSServerName: v.TLSServerName,
			},
			Conn: v.conn,
		})
		v.mu.Unlock()
	}
	return peers
}

// Status provides the status of the manager
func (m *PeerManager) Status() Status {
	m.mu.RLock()
	defer m.mu.RUnlock()
	availablePeers := 0
	for _, peer := range m.peers {
		peer.mu.Lock()
		if peer.conn != nil {
			// A peer counts as available unless its connection is in a
			// terminal/failing state.
			state := peer.conn.GetState()
			if state != connectivity.TransientFailure && state != connectivity.Shutdown {
				availablePeers++
			}
		}
		peer.mu.Unlock()
	}
	return Status{
		PeerServiceConnected: m.peerServiceConnected.Load(),
		AvailablePeers:       availablePeers,
	}
}

// upsert adds hp to the peer map, replacing (and disconnecting) any existing
// entry with the same name unless it is identical, then requests a connection
// attempt by sending the peer name on the updated channel.
func (m *PeerManager) upsert(hp *peerTypes.Peer) {
	if hp == nil {
		return
	}
	m.mu.Lock()

	p := m.peers[hp.Name]

	if p != nil && p.Peer.Equal(*hp) {
		// Nothing changed, we don't need to reconnect
		m.mu.Unlock()
		return
	}

	if p != nil {
		// Close old connection
		m.disconnect(p)
	}
	m.peers[hp.Name] = &peer{Peer: *hp}
	m.mu.Unlock()
	// Notify manageConnections outside the lock; the stop case prevents
	// blocking forever if the manager is shutting down.
	select {
	case <-m.stop:
	case m.updated <- hp.Name:
	}
}

// remove disconnects and deletes the peer named by hp, if present.
func (m *PeerManager) remove(hp *peerTypes.Peer) {
	if hp == nil {
		return
	}
	m.mu.Lock()
	if p, ok := m.peers[hp.Name]; ok {
		m.disconnect(p)
		delete(m.peers, hp.Name)
	}
	m.mu.Unlock()
}

// connect establishes a gRPC client connection to p unless one is already
// live. Failed attempts schedule the next try using the configured backoff;
// ignoreBackoff forces an attempt even if the backoff window has not elapsed.
// A nil p is a no-op.
func (m *PeerManager) connect(p *peer, ignoreBackoff bool) {
	if p == nil {
		return
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.conn != nil && p.conn.GetState() != connectivity.Shutdown {
		return // no need to attempt to connect
	}

	now := time.Now()
	// No address yet, or still inside the backoff window: skip.
	if p.Address == nil || (p.nextConnAttempt.After(now) && !ignoreBackoff) {
		return
	}

	scopedLog := m.opts.log.WithFields(logrus.Fields{
		"address":    p.Address,
		"hubble-tls": p.TLSEnabled,
		"peer":       p.Name,
	})

	scopedLog.Info("Connecting")
	conn, err := m.opts.clientConnBuilder.ClientConn(p.Address.String(), p.TLSServerName)
	if err != nil {
		// Record the failure and push the next attempt out by the
		// backoff duration derived from the attempt count.
		duration := m.opts.backoff.Duration(p.connAttempts)
		p.nextConnAttempt = now.Add(duration)
		p.connAttempts++
		scopedLog.WithFields(logrus.Fields{
			"error":       err,
			"next-try-in": duration,
		}).Warning("Failed to create gRPC client")
		return
	}
	// Success: reset backoff state.
	p.nextConnAttempt = time.Time{}
	p.connAttempts = 0
	p.conn = conn
	scopedLog.Info("Connected")
}

// disconnect closes and clears p's gRPC connection, if any. A nil p is a
// no-op. Callers may hold m.mu; only p.mu is taken here.
func (m *PeerManager) disconnect(p *peer) {
	if p == nil {
		return
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.conn == nil {
		return
	}

	scopedLog := m.opts.log.WithFields(logrus.Fields{
		"address":    p.Address,
		"hubble-tls": p.TLSEnabled,
		"peer":       p.Name,
	})

	scopedLog.Info("Disconnecting")
	if err := p.conn.Close(); err != nil {
		scopedLog.WithField("error", err).Warning("Failed to properly close gRPC client connection")
	}
	p.conn = nil
	scopedLog.Info("Disconnected")
}