github.com/cilium/cilium@v1.16.2/pkg/hubble/relay/pool/manager.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package pool
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"sync"
    10  	"sync/atomic"
    11  
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/sirupsen/logrus"
    14  	"google.golang.org/grpc/connectivity"
    15  
    16  	peerpb "github.com/cilium/cilium/api/v1/peer"
    17  	peerTypes "github.com/cilium/cilium/pkg/hubble/peer/types"
    18  	poolTypes "github.com/cilium/cilium/pkg/hubble/relay/pool/types"
    19  	"github.com/cilium/cilium/pkg/inctimer"
    20  	"github.com/cilium/cilium/pkg/lock"
    21  	"github.com/cilium/cilium/pkg/time"
    22  )
    23  
// peer couples a peer's identity (the embedded peerTypes.Peer) with its
// gRPC connection and reconnection-backoff bookkeeping. All fields are
// guarded by mu; hold it when reading or writing conn, connAttempts, or
// nextConnAttempt.
type peer struct {
	mu lock.Mutex
	peerTypes.Peer
	// conn is the current gRPC connection to the peer, or nil when
	// disconnected.
	conn poolTypes.ClientConn
	// connAttempts counts consecutive failed connection attempts; it feeds
	// the backoff duration computation and resets to 0 on success.
	connAttempts int
	// nextConnAttempt is the earliest time a new connection attempt may be
	// made (zero value means "no backoff in effect").
	nextConnAttempt time.Time
}
    31  
// PeerManager manages a pool of peers (Peer) and associated gRPC connections.
// Peers and peer change notifications are obtained from a peer gRPC service.
type PeerManager struct {
	opts options
	// updated carries names of peers that need a (re)connection attempt.
	updated chan string
	// wg tracks all background goroutines; Stop waits on it.
	wg sync.WaitGroup
	// stop is closed by Stop to signal all goroutines to exit.
	stop chan struct{}
	// peerServiceConnected reflects whether the Notify stream to the peer
	// service is currently established.
	peerServiceConnected atomic.Bool
	// mu guards the peers map (not the individual peer entries, which have
	// their own locks).
	mu      lock.RWMutex
	peers   map[string]*peer
	metrics *PoolMetrics
}
    44  
// Status describes the manager's current state: whether the peer service
// Notify stream is up and how many peers have a usable connection.
type Status struct {
	// PeerServiceConnected is true while the peer change notification
	// stream is established.
	PeerServiceConnected bool
	// AvailablePeers is the number of peers whose connection is neither in
	// transient failure nor shut down.
	AvailablePeers int
}
    49  
    50  // NewPeerManager creates a new manager that connects to a peer gRPC service to
    51  // manage peers and a connection to every peer's gRPC API.
    52  func NewPeerManager(registry prometheus.Registerer, options ...Option) (*PeerManager, error) {
    53  	opts := defaultOptions
    54  	for _, opt := range options {
    55  		if err := opt(&opts); err != nil {
    56  			return nil, fmt.Errorf("failed to apply option: %w", err)
    57  		}
    58  	}
    59  	metrics := NewPoolMetrics(registry)
    60  	return &PeerManager{
    61  		peers:                make(map[string]*peer),
    62  		updated:              make(chan string, 100),
    63  		stop:                 make(chan struct{}),
    64  		opts:                 opts,
    65  		metrics:              metrics,
    66  		peerServiceConnected: atomic.Bool{},
    67  	}, nil
    68  }
    69  
    70  // Start starts the manager.
    71  func (m *PeerManager) Start() {
    72  	m.wg.Add(3)
    73  	go func() {
    74  		defer m.wg.Done()
    75  		m.watchNotifications()
    76  	}()
    77  	go func() {
    78  		defer m.wg.Done()
    79  		m.manageConnections()
    80  	}()
    81  	go func() {
    82  		defer m.wg.Done()
    83  		m.reportConnectionStatus()
    84  	}()
    85  }
    86  
// watchNotifications maintains a peer change notification stream from the
// peer gRPC service and applies every received notification to the pool
// (upsert on add/update, remove on delete). Both client creation and the
// Notify stream are retried after retryTimeout on failure. The function
// returns only when the manager is stopped.
func (m *PeerManager) watchNotifications() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Propagate Stop() to the Notify stream via context cancellation so a
	// blocked Recv is unblocked.
	go func() {
		<-m.stop
		cancel()
	}()
	retryTimer, retryTimerDone := inctimer.New()
	defer retryTimerDone()
connect:
	for {
		// Establish a client to the peer service.
		cl, err := m.opts.peerClientBuilder.Client(m.opts.peerServiceAddress)
		if err != nil {
			m.opts.log.WithFields(logrus.Fields{
				"error":  err,
				"target": m.opts.peerServiceAddress,
			}).Warning("Failed to create peer client for peers synchronization; will try again after the timeout has expired")
			select {
			case <-m.stop:
				return
			case <-retryTimer.After(m.opts.retryTimeout):
				continue
			}
		}
		// Open the change notification stream.
		client, err := cl.Notify(ctx, &peerpb.NotifyRequest{})
		if err != nil {
			cl.Close()
			m.opts.log.WithFields(logrus.Fields{
				"error":              err,
				"connection timeout": m.opts.retryTimeout,
			}).Warning("Failed to create peer notify client for peers change notification; will try again after the timeout has expired")
			select {
			case <-m.stop:
				return
			case <-retryTimer.After(m.opts.retryTimeout):
				continue
			}
		}
		m.peerServiceConnected.Store(true)
		for {
			// Non-blocking stop check before each Recv; a Recv in flight
			// is unblocked by the cancellation goroutine above.
			select {
			case <-m.stop:
				cl.Close()
				return
			default:
			}
			cn, err := client.Recv()
			if err != nil {
				cl.Close()
				m.opts.log.WithFields(logrus.Fields{
					"error":              err,
					"connection timeout": m.opts.retryTimeout,
				}).Warning("Error while receiving peer change notification; will try again after the timeout has expired")
				// The stream is broken: mark the service as disconnected
				// until a new stream is established.
				m.peerServiceConnected.Store(false)
				select {
				case <-m.stop:
					return
				case <-retryTimer.After(m.opts.retryTimeout):
					// Re-dial from scratch (outer loop).
					continue connect
				}
			}
			m.opts.log.WithField("change notification", cn).Info("Received peer change notification")
			p := peerTypes.FromChangeNotification(cn)
			switch cn.GetType() {
			case peerpb.ChangeNotificationType_PEER_ADDED:
				m.upsert(p)
			case peerpb.ChangeNotificationType_PEER_DELETED:
				m.remove(p)
			case peerpb.ChangeNotificationType_PEER_UPDATED:
				m.upsert(p)
			}
		}
	}
}
   161  
   162  func (m *PeerManager) manageConnections() {
   163  	connTimer, connTimerDone := inctimer.New()
   164  	defer connTimerDone()
   165  	for {
   166  		select {
   167  		case <-m.stop:
   168  			return
   169  		case name := <-m.updated:
   170  			m.mu.RLock()
   171  			p := m.peers[name]
   172  			m.mu.RUnlock()
   173  			m.wg.Add(1)
   174  			go func(p *peer) {
   175  				defer m.wg.Done()
   176  				// a connection request has been made, make sure to attempt a connection
   177  				m.connect(p, true)
   178  			}(p)
   179  		case <-connTimer.After(m.opts.connCheckInterval):
   180  			m.mu.RLock()
   181  			for _, p := range m.peers {
   182  				m.wg.Add(1)
   183  				go func(p *peer) {
   184  					defer m.wg.Done()
   185  					m.connect(p, false)
   186  				}(p)
   187  			}
   188  			m.mu.RUnlock()
   189  		}
   190  	}
   191  }
   192  
   193  func (m *PeerManager) reportConnectionStatus() {
   194  	connTimer, connTimerDone := inctimer.New()
   195  	defer connTimerDone()
   196  	for {
   197  		select {
   198  		case <-m.stop:
   199  			return
   200  		case <-connTimer.After(m.opts.connStatusInterval):
   201  			m.mu.RLock()
   202  			connStates := make(map[connectivity.State]uint32)
   203  			var nilConnPeersNum uint32 = 0
   204  			for _, p := range m.peers {
   205  				p.mu.Lock()
   206  				if p.conn == nil {
   207  					nilConnPeersNum++
   208  					p.mu.Unlock()
   209  					continue
   210  				}
   211  				state := p.conn.GetState()
   212  				connStates[state] = connStates[state] + 1
   213  				p.mu.Unlock()
   214  			}
   215  			m.mu.RUnlock()
   216  			m.metrics.ObservePeerConnectionStatus(connStates, nilConnPeersNum)
   217  		}
   218  	}
   219  }
   220  
// Stop stops the manager.
// It signals all background goroutines to exit by closing the stop channel
// and then blocks until they have all returned. Stop must be called at most
// once: a second call would panic on the double close of m.stop.
func (m *PeerManager) Stop() {
	close(m.stop)
	m.wg.Wait()
}
   226  
   227  // List implements observer.PeerLister.List.
   228  func (m *PeerManager) List() []poolTypes.Peer {
   229  	m.mu.RLock()
   230  	defer m.mu.RUnlock()
   231  	if len(m.peers) == 0 {
   232  		return nil
   233  	}
   234  	peers := make([]poolTypes.Peer, 0, len(m.peers))
   235  	for _, v := range m.peers {
   236  		// note: there shouldn't be null entries in the map
   237  		v.mu.Lock()
   238  		peers = append(peers, poolTypes.Peer{
   239  			Peer: peerTypes.Peer{
   240  				Name:          v.Name,
   241  				Address:       v.Address,
   242  				TLSEnabled:    v.TLSEnabled,
   243  				TLSServerName: v.TLSServerName,
   244  			},
   245  			Conn: v.conn,
   246  		})
   247  		v.mu.Unlock()
   248  	}
   249  	return peers
   250  }
   251  
   252  // Status provides the status of the manager
   253  func (m *PeerManager) Status() Status {
   254  	m.mu.RLock()
   255  	defer m.mu.RUnlock()
   256  	availablePeers := 0
   257  	for _, peer := range m.peers {
   258  		peer.mu.Lock()
   259  		if peer.conn != nil {
   260  			state := peer.conn.GetState()
   261  			if state != connectivity.TransientFailure && state != connectivity.Shutdown {
   262  				availablePeers++
   263  			}
   264  		}
   265  		peer.mu.Unlock()
   266  	}
   267  	return Status{
   268  		PeerServiceConnected: m.peerServiceConnected.Load(),
   269  		AvailablePeers:       availablePeers,
   270  	}
   271  }
   272  
   273  func (m *PeerManager) upsert(hp *peerTypes.Peer) {
   274  	if hp == nil {
   275  		return
   276  	}
   277  	m.mu.Lock()
   278  
   279  	p := m.peers[hp.Name]
   280  
   281  	if p != nil && p.Peer.Equal(*hp) {
   282  		// Nothing changed, we don't need to reconnect
   283  		m.mu.Unlock()
   284  		return
   285  	}
   286  
   287  	if p != nil {
   288  		// Close old connection
   289  		m.disconnect(p)
   290  	}
   291  	m.peers[hp.Name] = &peer{Peer: *hp}
   292  	m.mu.Unlock()
   293  	select {
   294  	case <-m.stop:
   295  	case m.updated <- hp.Name:
   296  	}
   297  }
   298  
   299  func (m *PeerManager) remove(hp *peerTypes.Peer) {
   300  	if hp == nil {
   301  		return
   302  	}
   303  	m.mu.Lock()
   304  	if p, ok := m.peers[hp.Name]; ok {
   305  		m.disconnect(p)
   306  		delete(m.peers, hp.Name)
   307  	}
   308  	m.mu.Unlock()
   309  }
   310  
   311  func (m *PeerManager) connect(p *peer, ignoreBackoff bool) {
   312  	if p == nil {
   313  		return
   314  	}
   315  	p.mu.Lock()
   316  	defer p.mu.Unlock()
   317  	if p.conn != nil && p.conn.GetState() != connectivity.Shutdown {
   318  		return // no need to attempt to connect
   319  	}
   320  
   321  	now := time.Now()
   322  	if p.Address == nil || (p.nextConnAttempt.After(now) && !ignoreBackoff) {
   323  		return
   324  	}
   325  
   326  	scopedLog := m.opts.log.WithFields(logrus.Fields{
   327  		"address":    p.Address,
   328  		"hubble-tls": p.TLSEnabled,
   329  		"peer":       p.Name,
   330  	})
   331  
   332  	scopedLog.Info("Connecting")
   333  	conn, err := m.opts.clientConnBuilder.ClientConn(p.Address.String(), p.TLSServerName)
   334  	if err != nil {
   335  		duration := m.opts.backoff.Duration(p.connAttempts)
   336  		p.nextConnAttempt = now.Add(duration)
   337  		p.connAttempts++
   338  		scopedLog.WithFields(logrus.Fields{
   339  			"error":       err,
   340  			"next-try-in": duration,
   341  		}).Warning("Failed to create gRPC client")
   342  		return
   343  	}
   344  	p.nextConnAttempt = time.Time{}
   345  	p.connAttempts = 0
   346  	p.conn = conn
   347  	scopedLog.Info("Connected")
   348  }
   349  
   350  func (m *PeerManager) disconnect(p *peer) {
   351  	if p == nil {
   352  		return
   353  	}
   354  	p.mu.Lock()
   355  	defer p.mu.Unlock()
   356  	if p.conn == nil {
   357  		return
   358  	}
   359  
   360  	scopedLog := m.opts.log.WithFields(logrus.Fields{
   361  		"address":    p.Address,
   362  		"hubble-tls": p.TLSEnabled,
   363  		"peer":       p.Name,
   364  	})
   365  
   366  	scopedLog.Info("Disconnecting")
   367  	if err := p.conn.Close(); err != nil {
   368  		scopedLog.WithField("error", err).Warning("Failed to properly close gRPC client connection")
   369  	}
   370  	p.conn = nil
   371  	scopedLog.Info("Disconnected")
   372  }