github.com/cilium/cilium@v1.16.2/pkg/hubble/relay/observer/observer.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package observer

import (
	"context"
	"errors"
	"io"

	"github.com/sirupsen/logrus"
	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"

	observerpb "github.com/cilium/cilium/api/v1/observer"
	relaypb "github.com/cilium/cilium/api/v1/relay"
	poolTypes "github.com/cilium/cilium/pkg/hubble/relay/pool/types"
	"github.com/cilium/cilium/pkg/hubble/relay/queue"
	"github.com/cilium/cilium/pkg/inctimer"
	"github.com/cilium/cilium/pkg/lock"
	nodeTypes "github.com/cilium/cilium/pkg/node/types"
	"github.com/cilium/cilium/pkg/time"
)

// isAvailable reports whether the given client connection can be used to
// reach a peer, i.e. it exists and is neither in transient failure nor
// shut down.
func isAvailable(conn poolTypes.ClientConn) bool {
	if conn == nil {
		return false
	}
	state := conn.GetState()
	return state != connectivity.TransientFailure &&
		state != connectivity.Shutdown
}

// retrieveFlowsFromPeer requests flows from a single peer and forwards
// every response to the flows channel. It blocks until the peer closes the
// stream, an error occurs, or ctx is cancelled; clean termination (EOF or
// cancellation) is not reported as an error.
func retrieveFlowsFromPeer(
	ctx context.Context,
	client observerpb.ObserverClient,
	req *observerpb.GetFlowsRequest,
	flows chan<- *observerpb.GetFlowsResponse,
) error {
	c, err := client.GetFlows(ctx, req)
	if err != nil {
		return err
	}
	for {
		flow, err := c.Recv()
		if err != nil {
			if errors.Is(err, io.EOF) || errors.Is(err, context.Canceled) {
				return nil
			}
			if status.Code(err) == codes.Canceled {
				return nil
			}
			return err
		}

		select {
		case flows <- flow:
		case <-ctx.Done():
			return nil
		}
	}
}

// sortFlows reads flows from the given channel and returns a channel on
// which the same flows are emitted in timestamp order. Flows are buffered
// in a priority queue of size qlen; bufferDrainTimeout bounds how long a
// flow may be held back waiting for older flows to arrive.
func sortFlows(
	ctx context.Context,
	flows <-chan *observerpb.GetFlowsResponse,
	qlen int,
	bufferDrainTimeout time.Duration,
) <-chan *observerpb.GetFlowsResponse {
	pq := queue.NewPriorityQueue(qlen)
	sortedFlows := make(chan *observerpb.GetFlowsResponse, qlen)

	go func() {
		defer close(sortedFlows)
		bufferTimer, bufferTimerDone := inctimer.New()
		defer bufferTimerDone()
	flowsLoop:
		for {
			select {
			case flow, ok := <-flows:
				if !ok {
					break flowsLoop
				}
				if pq.Len() == qlen {
					f := pq.Pop()
					select {
					case sortedFlows <- f:
					case <-ctx.Done():
						return
					}
				}
				pq.Push(flow)
			case t := <-bufferTimer.After(bufferDrainTimeout):
				// Make sure to drain old flows from the queue when no new
				// flows are received. The bufferDrainTimeout duration is used
				// as a sorting window.
				for _, f := range pq.PopOlderThan(t.Add(-bufferDrainTimeout)) {
					select {
					case sortedFlows <- f:
					case <-ctx.Done():
						return
					}
				}
			case <-ctx.Done():
				return
			}
		}
		// drain the queue
		for f := pq.Pop(); f != nil; f = pq.Pop() {
			select {
			case sortedFlows <- f:
			case <-ctx.Done():
				return
			}
		}
	}()
	return sortedFlows
}
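
// A minimal usage sketch for sortFlows (hypothetical wiring, not taken from
// the relay server; the channel size and drain timeout are illustrative
// assumptions):
//
//	flows := make(chan *observerpb.GetFlowsResponse, 128)
//	sorted := sortFlows(ctx, flows, 128, time.Second)
//	for resp := range sorted {
//		_ = resp // flows arrive here in timestamp order
//	}
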
// nodeStatusError wraps an error into a NODE_ERROR NodeStatusEvent response
// attributed to the given node names.
func nodeStatusError(err error, nodeNames ...string) *observerpb.GetFlowsResponse {
	msg := err.Error()
	if s, ok := status.FromError(err); ok && s.Code() == codes.Unknown {
		msg = s.Message()
	}

	return &observerpb.GetFlowsResponse{
		NodeName: nodeTypes.GetAbsoluteNodeName(),
		Time:     timestamppb.New(time.Now()),
		ResponseTypes: &observerpb.GetFlowsResponse_NodeStatus{
			NodeStatus: &relaypb.NodeStatusEvent{
				StateChange: relaypb.NodeState_NODE_ERROR,
				NodeNames:   nodeNames,
				Message:     msg,
			},
		},
	}
}

// nodeStatusEvent builds a NodeStatusEvent response with the given state
// change for the given node names.
func nodeStatusEvent(state relaypb.NodeState, nodeNames ...string) *observerpb.GetFlowsResponse {
	return &observerpb.GetFlowsResponse{
		NodeName: nodeTypes.GetAbsoluteNodeName(),
		Time:     timestamppb.New(time.Now()),
		ResponseTypes: &observerpb.GetFlowsResponse_NodeStatus{
			NodeStatus: &relaypb.NodeStatusEvent{
				StateChange: state,
				NodeNames:   nodeNames,
			},
		},
	}
}

// aggregateErrors reads responses from the given channel and forwards them
// to the returned channel. Consecutive NODE_ERROR responses carrying the
// same message are merged into a single response listing all affected
// nodes; a pending error is flushed once errorAggregationWindow elapses.
func aggregateErrors(
	ctx context.Context,
	responses <-chan *observerpb.GetFlowsResponse,
	errorAggregationWindow time.Duration,
) <-chan *observerpb.GetFlowsResponse {
	aggregated := make(chan *observerpb.GetFlowsResponse, cap(responses))

	var flushPending <-chan time.Time
	var pendingResponse *observerpb.GetFlowsResponse

	go func() {
		defer close(aggregated)
	aggregateErrorsLoop:
		for {
			select {
			case response, ok := <-responses:
				if !ok {
					// flush any pending response before exiting
					if pendingResponse != nil {
						select {
						case aggregated <- pendingResponse:
						case <-ctx.Done():
						}
					}
					return
				}

				// any non-error responses are directly forwarded
				current := response.GetNodeStatus()
				if current.GetStateChange() != relaypb.NodeState_NODE_ERROR {
					select {
					case aggregated <- response:
						continue aggregateErrorsLoop
					case <-ctx.Done():
						return
					}
				}

				// either merge with pending or flush it
				if pending := pendingResponse.GetNodeStatus(); pending != nil {
					if current.GetMessage() == pending.GetMessage() {
						pending.NodeNames = append(pending.NodeNames, current.NodeNames...)
						continue aggregateErrorsLoop
					}

					select {
					case aggregated <- pendingResponse:
					case <-ctx.Done():
						return
					}
				}

				pendingResponse = response
				flushPending = inctimer.After(errorAggregationWindow)
			case <-flushPending:
				select {
				case aggregated <- pendingResponse:
					pendingResponse = nil
					flushPending = nil
				case <-ctx.Done():
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}()
	return aggregated
}
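
// A minimal sketch of the error-aggregation behaviour (hypothetical values;
// the window is an illustrative assumption): two NODE_ERROR responses with
// the same message collapse into one event naming both nodes.
//
//	responses := make(chan *observerpb.GetFlowsResponse, 2)
//	responses <- nodeStatusError(errors.New("unreachable"), "node-1")
//	responses <- nodeStatusError(errors.New("unreachable"), "node-2")
//	close(responses)
//	for resp := range aggregateErrors(ctx, responses, 5*time.Second) {
//		_ = resp // a single NodeStatusEvent with NodeNames ["node-1", "node-2"]
//	}
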
// sendFlowsResponse forwards flows from the sorted channel to the gRPC
// stream until the channel is closed or ctx is cancelled.
func sendFlowsResponse(ctx context.Context, stream observerpb.Observer_GetFlowsServer, sortedFlows <-chan *observerpb.GetFlowsResponse) error {
	for {
		select {
		case flow, ok := <-sortedFlows:
			if !ok {
				return nil
			}
			if err := stream.Send(flow); err != nil {
				return err
			}
		case <-ctx.Done():
			return nil
		}
	}
}

// newFlowCollector creates a flowCollector for the given request.
func newFlowCollector(req *observerpb.GetFlowsRequest, opts options) *flowCollector {
	fc := &flowCollector{
		log: opts.log,
		ocb: opts.ocb,

		req: req,

		connectedNodes: map[string]struct{}{},
	}
	return fc
}

// flowCollector fans out a GetFlows request to a set of peers and keeps
// track of which peers it is already collecting from.
type flowCollector struct {
	log logrus.FieldLogger
	ocb observerClientBuilder

	req *observerpb.GetFlowsRequest

	mu             lock.Mutex
	connectedNodes map[string]struct{}
}

// collect starts a goroutine in g for every newly connected peer to
// retrieve its flows into the flows channel. It returns the names of the
// connected and the unavailable peers.
func (fc *flowCollector) collect(ctx context.Context, g *errgroup.Group, peers []poolTypes.Peer, flows chan *observerpb.GetFlowsResponse) ([]string, []string) {
	var connected, unavailable []string
	fc.mu.Lock()
	defer fc.mu.Unlock()
	for _, p := range peers {
		if _, ok := fc.connectedNodes[p.Name]; ok {
			connected = append(connected, p.Name)
			continue
		}
		if !isAvailable(p.Conn) {
			fc.log.WithField("address", p.Address).Infof(
				"No connection to peer %s, skipping", p.Name,
			)
			unavailable = append(unavailable, p.Name)
			continue
		}
		connected = append(connected, p.Name)
		fc.connectedNodes[p.Name] = struct{}{}
		g.Go(func() error {
			// retrieveFlowsFromPeer blocks until the peer finishes the
			// request by closing the connection, an error occurs,
			// or ctx expires.
			err := retrieveFlowsFromPeer(ctx, fc.ocb.observerClient(&p), fc.req, flows)
			if err != nil {
				fc.log.WithFields(logrus.Fields{
					"error": err,
					"peer":  p,
				}).Warning("Failed to retrieve flows from peer")
				fc.mu.Lock()
				delete(fc.connectedNodes, p.Name)
				fc.mu.Unlock()
				select {
				case flows <- nodeStatusError(err, p.Name):
				case <-ctx.Done():
				}
			}
			return nil
		})
	}
	return connected, unavailable
}
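
// A minimal end-to-end sketch of how these helpers compose (hypothetical
// wiring; the queue size, timeouts, and the req/opts/peers/stream variables
// are illustrative assumptions, not values taken from the relay server):
//
//	fc := newFlowCollector(req, opts)
//	g, ctx := errgroup.WithContext(ctx)
//	flows := make(chan *observerpb.GetFlowsResponse, 128)
//	connected, unavailable := fc.collect(ctx, g, peers, flows)
//	_, _ = connected, unavailable
//	go func() {
//		_ = g.Wait() // all per-peer goroutines have returned
//		close(flows) // no more producers; let the pipeline drain
//	}()
//	sorted := sortFlows(ctx, flows, 128, time.Second)
//	aggregated := aggregateErrors(ctx, sorted, 5*time.Second)
//	err := sendFlowsResponse(ctx, stream, aggregated)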