github.com/cilium/cilium@v1.16.2/pkg/hubble/recorder/service.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package recorder 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "math/rand/v2" 11 "net" 12 "os" 13 "path" 14 "regexp" 15 16 "github.com/sirupsen/logrus" 17 "google.golang.org/protobuf/types/known/timestamppb" 18 19 recorderpb "github.com/cilium/cilium/api/v1/recorder" 20 "github.com/cilium/cilium/pkg/cidr" 21 "github.com/cilium/cilium/pkg/hubble/recorder/pcap" 22 "github.com/cilium/cilium/pkg/hubble/recorder/recorderoption" 23 "github.com/cilium/cilium/pkg/hubble/recorder/sink" 24 "github.com/cilium/cilium/pkg/idpool" 25 "github.com/cilium/cilium/pkg/logging" 26 "github.com/cilium/cilium/pkg/logging/logfields" 27 nodeTypes "github.com/cilium/cilium/pkg/node/types" 28 "github.com/cilium/cilium/pkg/recorder" 29 "github.com/cilium/cilium/pkg/time" 30 "github.com/cilium/cilium/pkg/u8proto" 31 ) 32 33 var log = logging.DefaultLogger.WithField(logfields.LogSubsys, "hubble-recorder") 34 35 var _ recorderpb.RecorderServer = (*Service)(nil) 36 37 const ( 38 minRuleID = 1 39 maxRuleID = 65534 40 41 defaultFileSinkPrefix = "hubble" 42 ) 43 44 type Service struct { 45 recorder *recorder.Recorder 46 dispatch *sink.Dispatch 47 ruleIDs *idpool.IDPool 48 opts recorderoption.Options 49 } 50 51 func NewService(r *recorder.Recorder, d *sink.Dispatch, options ...recorderoption.Option) (*Service, error) { 52 opts := recorderoption.Default 53 for _, o := range options { 54 if err := o(&opts); err != nil { 55 return nil, err 56 } 57 } 58 59 if len(opts.StoragePath) == 0 { 60 return nil, errors.New("storage path must not be empty") 61 } 62 63 if err := os.MkdirAll(opts.StoragePath, 0600); err != nil { 64 return nil, fmt.Errorf("failed to create storage path directory: %w", err) 65 } 66 67 return &Service{ 68 recorder: r, 69 dispatch: d, 70 ruleIDs: idpool.NewIDPool(minRuleID, maxRuleID), 71 opts: opts, 72 }, nil 73 } 74 75 func recordingStoppedResponse(stats sink.Statistics, filePath string) *recorderpb.RecordResponse { 76 return &recorderpb.RecordResponse{ 77 NodeName: nodeTypes.GetAbsoluteNodeName(), 78 Time: timestamppb.Now(), 79 ResponseType: &recorderpb.RecordResponse_Stopped{ 80 Stopped: &recorderpb.RecordingStoppedResponse{ 81 Stats: &recorderpb.RecordingStatistics{ 82 BytesCaptured: stats.BytesWritten, 83 PacketsCaptured: stats.PacketsWritten, 84 BytesLost: stats.BytesLost, 85 PacketsLost: stats.PacketsLost, 86 }, 87 Filesink: &recorderpb.FileSinkResult{ 88 FilePath: filePath, 89 }, 90 }, 91 }, 92 } 93 } 94 95 func recordingRunningResponse(stats sink.Statistics) *recorderpb.RecordResponse { 96 return &recorderpb.RecordResponse{ 97 NodeName: nodeTypes.GetAbsoluteNodeName(), 98 Time: timestamppb.Now(), 99 ResponseType: &recorderpb.RecordResponse_Running{ 100 Running: &recorderpb.RecordingRunningResponse{ 101 Stats: &recorderpb.RecordingStatistics{ 102 BytesCaptured: stats.BytesWritten, 103 PacketsCaptured: stats.PacketsWritten, 104 BytesLost: stats.BytesLost, 105 PacketsLost: stats.PacketsLost, 106 }, 107 }, 108 }, 109 } 110 } 111 112 func (s *Service) Record(stream recorderpb.Recorder_RecordServer) error { 113 ctx, cancel := context.WithCancel(stream.Context()) 114 defer cancel() 115 116 // Spawn a goroutine that forwards any received messages in order to be 117 // able to use select on it 118 reqCh := make(chan *recorderpb.RecordRequest) 119 errCh := make(chan error, 1) 120 go func() { 121 for { 122 req, err := stream.Recv() 123 if err != nil { 124 errCh <- fmt.Errorf("failed to receive from recorder client: %w", err) 125 return 126 } 127 128 select { 129 case reqCh <- req: 130 case <-ctx.Done(): 131 return 132 } 133 } 134 }() 135 136 var ( 137 recording *sink.Handle 138 filePath string 139 err error 140 ) 141 142 // Wait for the initial StartRecording message 143 select { 144 case req := <-reqCh: 145 startRecording := req.GetStart() 146 if startRecording == nil { 147 return fmt.Errorf("received invalid request %q, expected start request", req) 148 } 149 150 // The startRecording helper spawns a clean up goroutine to remove all 151 // state associated with this recording when the context ctx is cancelled. 152 recording, filePath, err = s.startRecording(ctx, startRecording) 153 if err != nil { 154 return err 155 } 156 case err = <-errCh: 157 return err 158 case <-ctx.Done(): 159 return ctx.Err() 160 } 161 162 // Send back a confirmation that the recording has started 163 err = stream.Send(recordingRunningResponse(recording.Stats())) 164 if err != nil { 165 return fmt.Errorf("failed to confirmation response: %w", err) 166 } 167 168 for { 169 select { 170 // This case happens when the client has sent us a new request. 171 // We expect a start request if recording is nil, and a stop request 172 // otherwise. 173 case req := <-reqCh: 174 if req.GetStop() != nil { 175 recording.Stop() 176 } else { 177 return fmt.Errorf("received invalid request %q, expected stop request", req) 178 } 179 // This case is hit whenever the recording has updated the statistics (i.e. 180 // packets have been captured). We fetch the latest statistics and forward 181 // them to the client 182 case <-recording.StatsUpdated: 183 err = stream.Send(recordingRunningResponse(recording.Stats())) 184 if err != nil { 185 return fmt.Errorf("failed to send recording running response: %w", err) 186 } 187 // This case happens when the recording has stopped (i.e. due to the above 188 // explicit shutdown or because an error has occurred). If no error has 189 // occurred, we assemble the final RecordingStoppedResponse and exit. 190 // If an error occurred, we propagate it by returning it from this stub. 191 case <-recording.Done: 192 err = recording.Err() 193 if err != nil { 194 return fmt.Errorf("recorder recording error: %w", err) 195 } 196 197 err = stream.Send(recordingStoppedResponse(recording.Stats(), filePath)) 198 if err != nil { 199 return fmt.Errorf("failed to send recording stopped response: %w", err) 200 } 201 202 return nil 203 // The following two cases happen when the client stream is either 204 // closed or cancelled. Simply return an error such that it is logged, 205 // and exit. 206 case err = <-errCh: 207 return err 208 case <-ctx.Done(): 209 return ctx.Err() 210 } 211 } 212 } 213 214 const fileExistsRetries = 100 215 216 var allowedFileChars = regexp.MustCompile("[^a-zA-Z0-9_.-]") 217 218 func createPcapFile(basedir, prefix string) (f *os.File, filePath string, err error) { 219 try := 0 220 for { 221 startTime := time.Now().Unix() 222 random := rand.Uint32() 223 nodeName := nodeTypes.GetAbsoluteNodeName() 224 name := fmt.Sprintf("%s_%d_%d_%s.pcap", prefix, startTime, random, nodeName) 225 sanitizedName := allowedFileChars.ReplaceAllLiteralString(name, "_") 226 filePath = path.Join(basedir, sanitizedName) 227 f, err = os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0600) 228 if err != nil { 229 if os.IsExist(err) { 230 if try++; try < fileExistsRetries { 231 continue 232 } 233 } 234 return f, "", fmt.Errorf("failed to create pcap file %q: %w", filePath, err) 235 } 236 237 return f, filePath, nil 238 } 239 } 240 241 func parseFilters(include []*recorderpb.Filter) ([]recorder.RecorderTuple, error) { 242 if len(include) == 0 { 243 return nil, errors.New("need to specify at least one include filter") 244 } 245 246 filters := []recorder.RecorderTuple{} 247 for _, f := range include { 248 srcIP, srcPrefix, err := net.ParseCIDR(f.GetSourceCidr()) 249 if err != nil { 250 return nil, fmt.Errorf("failed to parse source cidr %q: %w", f.GetSourceCidr(), err) 251 } 252 253 dstIP, dstPrefix, err := net.ParseCIDR(f.GetDestinationCidr()) 254 if err != nil { 255 return nil, fmt.Errorf("failed to parse source cidr %q: %w", f.GetDestinationCidr(), err) 256 } 257 258 if (srcIP.To4() == nil) != (dstIP.To4() == nil) { 259 return nil, fmt.Errorf("source (%s) and destination cidr (%s) must be same protocol version", 260 f.GetSourceCidr(), f.GetDestinationCidr()) 261 } 262 263 const maxPort = 65535 264 if f.GetSourcePort() > maxPort { 265 return nil, fmt.Errorf("source port %d out of range", f.GetSourcePort()) 266 } 267 268 if f.GetDestinationPort() > maxPort { 269 return nil, fmt.Errorf("destination port %d out of range", f.GetDestinationPort()) 270 } 271 272 filters = append(filters, recorder.RecorderTuple{ 273 SrcPrefix: *cidr.NewCIDR(srcPrefix), 274 SrcPort: uint16(f.GetSourcePort()), 275 DstPrefix: *cidr.NewCIDR(dstPrefix), 276 DstPort: uint16(f.GetDestinationPort()), 277 Proto: u8proto.U8proto(f.GetProtocol()), 278 }) 279 } 280 281 return filters, nil 282 } 283 284 var fileSinkPrefixRegex = regexp.MustCompile("^[a-z][a-z0-9]{0,19}$") 285 286 // startRecording starts a new recording. It will clean up any state 287 // associated with the recording if ctx is cancelled or handle.Stop is called. 288 func (s *Service) startRecording( 289 ctx context.Context, 290 req *recorderpb.StartRecording, 291 ) (handle *sink.Handle, filePath string, err error) { 292 capLen := req.GetMaxCaptureLength() 293 prefix := req.GetFilesink().GetFilePrefix() 294 if prefix == "" { 295 prefix = defaultFileSinkPrefix 296 } 297 298 if !fileSinkPrefixRegex.MatchString(prefix) { 299 return nil, "", fmt.Errorf("invalid file sink prefix: %q", prefix) 300 } 301 302 filters, err := parseFilters(req.GetInclude()) 303 if err != nil { 304 return nil, "", err 305 } 306 307 leaseID := s.ruleIDs.LeaseAvailableID() 308 ruleID := uint16(leaseID) 309 if leaseID == idpool.NoID { 310 return nil, "", errors.New("unable to allocate capture rule id") 311 } 312 313 var f *os.File 314 f, filePath, err = createPcapFile(s.opts.StoragePath, prefix) 315 if err != nil { 316 return nil, "", err 317 } 318 319 defer func() { 320 // clean up the recording if any of the subsequent steps fails 321 if err != nil { 322 _, _ = s.recorder.DeleteRecorder(recorder.ID(ruleID)) 323 // remove the created pcap file 324 _ = f.Close() 325 _ = os.Remove(filePath) 326 // release will also invalidate the lease 327 _ = s.ruleIDs.Release(idpool.ID(ruleID)) 328 } 329 }() 330 331 scopedLog := log.WithFields(logrus.Fields{ 332 "ruleID": ruleID, 333 "filePath": filePath, 334 }) 335 scopedLog.Debug("starting new recording") 336 337 stop := req.GetStopCondition() 338 config := sink.PcapSink{ 339 RuleID: ruleID, 340 Header: pcap.Header{ 341 SnapshotLength: capLen, 342 Datalink: pcap.Ethernet, 343 }, 344 Writer: pcap.NewWriter(f), 345 StopCondition: sink.StopConditions{ 346 PacketsCaptured: stop.GetPacketsCapturedCount(), 347 BytesCaptured: stop.GetBytesCapturedCount(), 348 DurationElapsed: stop.GetTimeElapsed().AsDuration(), 349 }, 350 } 351 352 // Upserting a new recorder can take up to a few seconds due to datapath 353 // regeneration. To avoid having the stop condition timer on the sink 354 // already running while the recorder is still being upserted, we install 355 // the recorder before the sink. This is safe, as sink.Dispatch silently 356 // ignores recordings for unknown sinks. 357 recInfo := &recorder.RecInfo{ 358 ID: recorder.ID(ruleID), 359 CapLen: uint16(capLen), 360 Filters: filters, 361 } 362 _, err = s.recorder.UpsertRecorder(recInfo) 363 if err != nil { 364 return nil, "", err 365 } 366 367 handle, err = s.dispatch.StartSink(ctx, config) 368 if err != nil { 369 return nil, "", err 370 } 371 372 // Ensure to delete the above recorder when the sink has stopped 373 go func() { 374 <-handle.Done 375 scopedLog.Debug("stopping recording") 376 _, err := s.recorder.DeleteRecorder(recorder.ID(ruleID)) 377 if err != nil { 378 scopedLog.WithError(err).Warning("failed to delete recorder") 379 } 380 s.ruleIDs.Release(idpool.ID(ruleID)) 381 }() 382 383 s.ruleIDs.Use(leaseID) 384 385 return handle, filePath, nil 386 }