github.com/cilium/cilium@v1.16.2/pkg/hubble/relay/observer/server.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package observer

import (
	"context"
	"fmt"

	"github.com/sirupsen/logrus"
	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/metadata"
	grpcStatus "google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/wrapperspb"

	observerpb "github.com/cilium/cilium/api/v1/observer"
	relaypb "github.com/cilium/cilium/api/v1/relay"
	"github.com/cilium/cilium/pkg/hubble/build"
	"github.com/cilium/cilium/pkg/hubble/observer"
	poolTypes "github.com/cilium/cilium/pkg/hubble/relay/pool/types"
	"github.com/cilium/cilium/pkg/inctimer"
	"github.com/cilium/cilium/pkg/lock"
)

// numUnavailableNodesReportMax is the maximum number of unavailable nodes
// reported in a ServerStatus response. The list is intentionally not
// exhaustive: reporting every unavailable node could clutter the response in
// large clusters, and up to 10 is a reasonable sample.
const numUnavailableNodesReportMax = 10

// PeerLister is the interface that wraps the List method.
type PeerLister interface {
	// List returns a list of peers with active connections. If a peer cannot
	// be connected to, its Conn attribute must be nil.
	List() []poolTypes.Peer
}

// Server implements the observerpb.ObserverServer interface.
type Server struct {
	opts  options
	peers PeerLister
}

// NewServer creates a new Server.
func NewServer(peers PeerLister, options ...Option) (*Server, error) {
	opts := defaultOptions
	for _, opt := range options {
		if err := opt(&opts); err != nil {
			return nil, fmt.Errorf("failed to apply option: %w", err)
		}
	}
	return &Server{
		opts:  opts,
		peers: peers,
	}, nil
}
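// PeerListerFunc is a convenience adapter in the spirit of http.HandlerFunc,
// added here purely for illustration (it is not part of the upstream API). It
// lets an ordinary function satisfy PeerLister, which is handy in tests where
// a fixed slice of peers can stand in for the real connection pool:
//
//	srv, err := NewServer(PeerListerFunc(func() []poolTypes.Peer { return nil }))
type PeerListerFunc func() []poolTypes.Peer

// List implements PeerLister by calling f itself.
func (f PeerListerFunc) List() []poolTypes.Peer { return f() }
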
// GetFlows implements observerpb.ObserverServer.GetFlows by proxying requests
// to the Hubble instances the relay is connected to.
func (s *Server) GetFlows(req *observerpb.GetFlowsRequest, stream observerpb.Observer_GetFlowsServer) error {
	ctx := stream.Context()
	md, ok := metadata.FromIncomingContext(ctx)
	if ok {
		ctx = metadata.NewOutgoingContext(ctx, md)
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	peers := s.peers.List()
	qlen := s.opts.sortBufferMaxLen // we don't want to buffer too many flows
	if nqlen := req.GetNumber() * uint64(len(peers)); nqlen > 0 && nqlen < uint64(qlen) {
		// Don't make the queue bigger than necessary, as that would be a
		// problem for the priority queue (we pop out when the queue is full).
		qlen = int(nqlen)
	}

	g, gctx := errgroup.WithContext(ctx)
	flows := make(chan *observerpb.GetFlowsResponse, qlen)

	fc := newFlowCollector(req, s.opts)
	connectedNodes, unavailableNodes := fc.collect(gctx, g, peers, flows)

	if req.GetFollow() {
		go func() {
			updateTimer, updateTimerDone := inctimer.New()
			defer updateTimerDone()
			for {
				select {
				case <-updateTimer.After(s.opts.peerUpdateInterval):
					peers := s.peers.List()
					_, _ = fc.collect(gctx, g, peers, flows)
				case <-gctx.Done():
					return
				}
			}
		}()
	}
	go func() {
		g.Wait()
		close(flows)
	}()

	aggregated := aggregateErrors(ctx, flows, s.opts.errorAggregationWindow)
	sortedFlows := sortFlows(ctx, aggregated, qlen, s.opts.sortBufferDrainTimeout)

	// Before streaming flows, inform the client about the nodes from which we
	// expect to receive them.
	if len(connectedNodes) > 0 {
		status := nodeStatusEvent(relaypb.NodeState_NODE_CONNECTED, connectedNodes...)
		if err := stream.Send(status); err != nil {
			return err
		}
	}
	if len(unavailableNodes) > 0 {
		status := nodeStatusEvent(relaypb.NodeState_NODE_UNAVAILABLE, unavailableNodes...)
		if err := stream.Send(status); err != nil {
			return err
		}
	}

	err := sendFlowsResponse(ctx, stream, sortedFlows)
	if err != nil {
		return err
	}
	return g.Wait()
}

// GetAgentEvents implements observerpb.ObserverServer.GetAgentEvents by
// proxying requests to the Hubble instances the relay is connected to.
func (s *Server) GetAgentEvents(req *observerpb.GetAgentEventsRequest, stream observerpb.Observer_GetAgentEventsServer) error {
	return grpcStatus.Errorf(codes.Unimplemented, "GetAgentEvents not yet implemented")
}

// GetDebugEvents implements observerpb.ObserverServer.GetDebugEvents by
// proxying requests to the Hubble instances the relay is connected to.
func (s *Server) GetDebugEvents(req *observerpb.GetDebugEventsRequest, stream observerpb.Observer_GetDebugEventsServer) error {
	return grpcStatus.Errorf(codes.Unimplemented, "GetDebugEvents not yet implemented")
}
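// A minimal client-side sketch of consuming GetFlows through the relay
// (hypothetical: assumes a dialed *grpc.ClientConn named conn and the usual
// errors/fmt/io imports). The stream interleaves node status events with
// flows, which is why the switch below checks both:
//
//	client := observerpb.NewObserverClient(conn)
//	stream, err := client.GetFlows(ctx, &observerpb.GetFlowsRequest{Number: 20})
//	if err != nil {
//		return err
//	}
//	for {
//		resp, err := stream.Recv()
//		if errors.Is(err, io.EOF) {
//			break
//		}
//		if err != nil {
//			return err
//		}
//		switch {
//		case resp.GetFlow() != nil:
//			fmt.Println("flow:", resp.GetFlow())
//		case resp.GetNodeStatus() != nil:
//			fmt.Println("node status:", resp.GetNodeStatus().GetStateChange())
//		}
//	}
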
// GetNodes implements observerpb.ObserverServer.GetNodes.
func (s *Server) GetNodes(ctx context.Context, req *observerpb.GetNodesRequest) (*observerpb.GetNodesResponse, error) {
	if md, ok := metadata.FromIncomingContext(ctx); ok {
		ctx = metadata.NewOutgoingContext(ctx, md)
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	g, ctx := errgroup.WithContext(ctx)

	peers := s.peers.List()
	nodes := make([]*observerpb.Node, 0, len(peers))
	for _, p := range peers {
		n := &observerpb.Node{
			Name: p.Name,
			Tls: &observerpb.TLS{
				Enabled:    p.TLSEnabled,
				ServerName: p.TLSServerName,
			},
		}
		if p.Address != nil {
			n.Address = p.Address.String()
		}
		nodes = append(nodes, n)
		if !isAvailable(p.Conn) {
			n.State = relaypb.NodeState_NODE_UNAVAILABLE
			s.opts.log.WithField("address", p.Address).Infof(
				"No connection to peer %s, skipping", p.Name,
			)
			continue
		}
		n.State = relaypb.NodeState_NODE_CONNECTED
		g.Go(func() error {
			client := s.opts.ocb.observerClient(&p)
			status, err := client.ServerStatus(ctx, &observerpb.ServerStatusRequest{})
			if err != nil {
				n.State = relaypb.NodeState_NODE_ERROR
				s.opts.log.WithFields(logrus.Fields{
					"error": err,
					"peer":  p,
				}).Warning("Failed to retrieve server status")
				return nil
			}
			n.Version = status.GetVersion()
			n.UptimeNs = status.GetUptimeNs()
			n.MaxFlows = status.GetMaxFlows()
			n.NumFlows = status.GetNumFlows()
			n.SeenFlows = status.GetSeenFlows()
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return &observerpb.GetNodesResponse{Nodes: nodes}, nil
}

// GetNamespaces implements observerpb.ObserverServer.GetNamespaces.
func (s *Server) GetNamespaces(ctx context.Context, req *observerpb.GetNamespacesRequest) (*observerpb.GetNamespacesResponse, error) {
	if md, ok := metadata.FromIncomingContext(ctx); ok {
		ctx = metadata.NewOutgoingContext(ctx, md)
	}
	// We are not using errgroup.WithContext because we prefer returning
	// partial results over failing on the first error.
	g := new(errgroup.Group)

	namespaceManager := observer.NewNamespaceManager()

	for _, p := range s.peers.List() {
		if !isAvailable(p.Conn) {
			s.opts.log.WithField("address", p.Address).Infof(
				"No connection to peer %s, skipping", p.Name,
			)
			continue
		}

		g.Go(func() error {
			client := s.opts.ocb.observerClient(&p)
			nsResp, err := client.GetNamespaces(ctx, req)
			if err != nil {
				s.opts.log.WithFields(logrus.Fields{
					"error": err,
					"peer":  p,
				}).Warning("Failed to retrieve namespaces")
				return nil
			}
			for _, ns := range nsResp.GetNamespaces() {
				namespaceManager.AddNamespace(ns)
			}
			return nil
		})
	}

	if err := g.Wait(); err != nil {
		return nil, err
	}

	return &observerpb.GetNamespacesResponse{Namespaces: namespaceManager.GetNamespaces()}, nil
}
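// A minimal sketch of querying the aggregated node list (hypothetical:
// assumes a dialed *grpc.ClientConn named conn). Unreachable peers are still
// listed, with State set to NODE_UNAVAILABLE or NODE_ERROR rather than being
// dropped:
//
//	client := observerpb.NewObserverClient(conn)
//	resp, err := client.GetNodes(ctx, &observerpb.GetNodesRequest{})
//	if err != nil {
//		return err
//	}
//	for _, node := range resp.GetNodes() {
//		fmt.Printf("%s state=%s version=%s\n",
//			node.GetName(), node.GetState(), node.GetVersion())
//	}
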
// ServerStatus implements observerpb.ObserverServer.ServerStatus by
// aggregating the ServerStatus answers of all Hubble peers.
func (s *Server) ServerStatus(ctx context.Context, req *observerpb.ServerStatusRequest) (*observerpb.ServerStatusResponse, error) {
	var (
		cancel context.CancelFunc
		g      *errgroup.Group
	)
	md, ok := metadata.FromIncomingContext(ctx)
	if ok {
		ctx = metadata.NewOutgoingContext(ctx, md)
	}
	ctx, cancel = context.WithCancel(ctx)
	defer cancel()
	g, ctx = errgroup.WithContext(ctx)

	peers := s.peers.List()
	mu := lock.Mutex{}
	numUnavailableNodes := 0
	var unavailableNodes []string
	statuses := make(chan *observerpb.ServerStatusResponse, len(peers))
	for _, p := range peers {
		if !isAvailable(p.Conn) {
			s.opts.log.WithField("address", p.Address).Infof(
				"No connection to peer %s, skipping", p.Name,
			)
			mu.Lock()
			numUnavailableNodes++
			if len(unavailableNodes) < numUnavailableNodesReportMax {
				unavailableNodes = append(unavailableNodes, p.Name)
			}
			mu.Unlock()
			continue
		}

		g.Go(func() error {
			client := s.opts.ocb.observerClient(&p)
			status, err := client.ServerStatus(ctx, req)
			if err != nil {
				s.opts.log.WithFields(logrus.Fields{
					"error": err,
					"peer":  p,
				}).Warning("Failed to retrieve server status")
				mu.Lock()
				numUnavailableNodes++
				if len(unavailableNodes) < numUnavailableNodesReportMax {
					unavailableNodes = append(unavailableNodes, p.Name)
				}
				mu.Unlock()
				return nil
			}
			select {
			case statuses <- status:
			case <-ctx.Done():
			}
			return nil
		})
	}
	go func() {
		g.Wait()
		close(statuses)
	}()
	resp := &observerpb.ServerStatusResponse{
		Version: build.RelayVersion.String(),
	}
	for status := range statuses {
		if status == nil {
			continue
		}
		resp.MaxFlows += status.MaxFlows
		resp.NumFlows += status.NumFlows
		resp.SeenFlows += status.SeenFlows
		// Use the oldest (i.e. largest) uptime as the reference, as summing
		// uptimes across nodes would make little sense.
		if resp.UptimeNs < status.UptimeNs {
			resp.UptimeNs = status.UptimeNs
		}
		resp.FlowsRate += status.FlowsRate
	}

	resp.NumConnectedNodes = &wrapperspb.UInt32Value{
		Value: uint32(len(peers) - numUnavailableNodes),
	}
	resp.NumUnavailableNodes = &wrapperspb.UInt32Value{
		Value: uint32(numUnavailableNodes),
	}
	resp.UnavailableNodes = unavailableNodes

	return resp, g.Wait()
}
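// A minimal sketch of reading the aggregated status (hypothetical: assumes a
// dialed *grpc.ClientConn named conn). Note the aggregation semantics above:
// NumFlows, MaxFlows and SeenFlows are summed across peers, UptimeNs is the
// oldest peer uptime rather than a sum, and UnavailableNodes is capped at
// numUnavailableNodesReportMax entries:
//
//	client := observerpb.NewObserverClient(conn)
//	status, err := client.ServerStatus(ctx, &observerpb.ServerStatusRequest{})
//	if err != nil {
//		return err
//	}
//	fmt.Printf("connected=%d unavailable=%d flows=%d/%d\n",
//		status.GetNumConnectedNodes().GetValue(),
//		status.GetNumUnavailableNodes().GetValue(),
//		status.GetNumFlows(), status.GetMaxFlows())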