github.com/cilium/cilium@v1.16.2/pkg/hubble/recorder/service.go

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package recorder
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"math/rand/v2"
    11  	"net"
    12  	"os"
    13  	"path"
    14  	"regexp"
    15  
    16  	"github.com/sirupsen/logrus"
    17  	"google.golang.org/protobuf/types/known/timestamppb"
    18  
    19  	recorderpb "github.com/cilium/cilium/api/v1/recorder"
    20  	"github.com/cilium/cilium/pkg/cidr"
    21  	"github.com/cilium/cilium/pkg/hubble/recorder/pcap"
    22  	"github.com/cilium/cilium/pkg/hubble/recorder/recorderoption"
    23  	"github.com/cilium/cilium/pkg/hubble/recorder/sink"
    24  	"github.com/cilium/cilium/pkg/idpool"
    25  	"github.com/cilium/cilium/pkg/logging"
    26  	"github.com/cilium/cilium/pkg/logging/logfields"
    27  	nodeTypes "github.com/cilium/cilium/pkg/node/types"
    28  	"github.com/cilium/cilium/pkg/recorder"
    29  	"github.com/cilium/cilium/pkg/time"
    30  	"github.com/cilium/cilium/pkg/u8proto"
    31  )
    32  
    33  var log = logging.DefaultLogger.WithField(logfields.LogSubsys, "hubble-recorder")
    34  
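         // Compile-time check that *Service implements the generated
         // RecorderServer interface.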
    35  var _ recorderpb.RecorderServer = (*Service)(nil)
    36  
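         // Capture rule IDs are uint16 values handed to the datapath; they are
         // allocated from this fixed range via an ID pool.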
    37  const (
    38  	minRuleID = 1
    39  	maxRuleID = 65534
    40  
    41  	defaultFileSinkPrefix = "hubble"
    42  )
    43  
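         // Service implements the Hubble recorder gRPC service. It installs
         // packet capture rules via the recorder, allocates capture rule IDs
         // from a pool, and writes captured packets to pcap files through sink
         // handles obtained from the dispatch.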
    44  type Service struct {
    45  	recorder *recorder.Recorder
    46  	dispatch *sink.Dispatch
    47  	ruleIDs  *idpool.IDPool
    48  	opts     recorderoption.Options
    49  }
    50  
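         // NewService creates a new recorder service: it applies the given
         // options, ensures the pcap storage directory exists and initializes
         // the pool of capture rule IDs.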
    51  func NewService(r *recorder.Recorder, d *sink.Dispatch, options ...recorderoption.Option) (*Service, error) {
    52  	opts := recorderoption.Default
    53  	for _, o := range options {
    54  		if err := o(&opts); err != nil {
    55  			return nil, err
    56  		}
    57  	}
    58  
    59  	if len(opts.StoragePath) == 0 {
    60  		return nil, errors.New("storage path must not be empty")
    61  	}
    62  
    63  	if err := os.MkdirAll(opts.StoragePath, 0600); err != nil {
    64  		return nil, fmt.Errorf("failed to create storage path directory: %w", err)
    65  	}
    66  
    67  	return &Service{
    68  		recorder: r,
    69  		dispatch: d,
    70  		ruleIDs:  idpool.NewIDPool(minRuleID, maxRuleID),
    71  		opts:     opts,
    72  	}, nil
    73  }
    74  
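         // recordingStoppedResponse assembles the final response sent to the
         // client after a recording has stopped, containing the capture
         // statistics and the path of the written pcap file.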
    75  func recordingStoppedResponse(stats sink.Statistics, filePath string) *recorderpb.RecordResponse {
    76  	return &recorderpb.RecordResponse{
    77  		NodeName: nodeTypes.GetAbsoluteNodeName(),
    78  		Time:     timestamppb.Now(),
    79  		ResponseType: &recorderpb.RecordResponse_Stopped{
    80  			Stopped: &recorderpb.RecordingStoppedResponse{
    81  				Stats: &recorderpb.RecordingStatistics{
    82  					BytesCaptured:   stats.BytesWritten,
    83  					PacketsCaptured: stats.PacketsWritten,
    84  					BytesLost:       stats.BytesLost,
    85  					PacketsLost:     stats.PacketsLost,
    86  				},
    87  				Filesink: &recorderpb.FileSinkResult{
    88  					FilePath: filePath,
    89  				},
    90  			},
    91  		},
    92  	}
    93  }
    94  
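         // recordingRunningResponse assembles an intermediate response
         // carrying the current capture statistics of an ongoing recording.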
    95  func recordingRunningResponse(stats sink.Statistics) *recorderpb.RecordResponse {
    96  	return &recorderpb.RecordResponse{
    97  		NodeName: nodeTypes.GetAbsoluteNodeName(),
    98  		Time:     timestamppb.Now(),
    99  		ResponseType: &recorderpb.RecordResponse_Running{
   100  			Running: &recorderpb.RecordingRunningResponse{
   101  				Stats: &recorderpb.RecordingStatistics{
   102  					BytesCaptured:   stats.BytesWritten,
   103  					PacketsCaptured: stats.PacketsWritten,
   104  					BytesLost:       stats.BytesLost,
   105  					PacketsLost:     stats.PacketsLost,
   106  				},
   107  			},
   108  		},
   109  	}
   110  }
   111  
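         // Record implements the Recorder service's bidirectional Record RPC.
         // The client must first send a StartRecording request; the server
         // then streams RecordingRunningResponse messages with updated capture
         // statistics until the client sends a StopRecording request or a
         // configured stop condition triggers, after which it sends a final
         // RecordingStoppedResponse containing the pcap file path.
         //
         // A rough client-side sketch; RequestType, the RecordRequest_Start and
         // RecordRequest_Stop wrappers, StopRecording and NewRecorderClient are
         // inferred from the generated getters and the response wrappers used
         // above, conn is assumed to be an established gRPC client connection
         // and error handling is omitted:
         //
         //	client := recorderpb.NewRecorderClient(conn)
         //	stream, _ := client.Record(ctx)
         //	_ = stream.Send(&recorderpb.RecordRequest{
         //		RequestType: &recorderpb.RecordRequest_Start{Start: &recorderpb.StartRecording{
         //			Include: []*recorderpb.Filter{{SourceCidr: "0.0.0.0/0", DestinationCidr: "10.0.0.0/8"}},
         //		}},
         //	})
         //	// ... receive RecordingRunningResponse updates via stream.Recv() ...
         //	_ = stream.Send(&recorderpb.RecordRequest{
         //		RequestType: &recorderpb.RecordRequest_Stop{Stop: &recorderpb.StopRecording{}},
         //	})
         //	// the final message is a RecordingStoppedResponse carrying the pcap file path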
   112  func (s *Service) Record(stream recorderpb.Recorder_RecordServer) error {
   113  	ctx, cancel := context.WithCancel(stream.Context())
   114  	defer cancel()
   115  
    116  	// Spawn a goroutine that forwards any received messages to a channel
    117  	// so that they can be multiplexed with other events via select below.
   118  	reqCh := make(chan *recorderpb.RecordRequest)
   119  	errCh := make(chan error, 1)
   120  	go func() {
   121  		for {
   122  			req, err := stream.Recv()
   123  			if err != nil {
   124  				errCh <- fmt.Errorf("failed to receive from recorder client: %w", err)
   125  				return
   126  			}
   127  
   128  			select {
   129  			case reqCh <- req:
   130  			case <-ctx.Done():
   131  				return
   132  			}
   133  		}
   134  	}()
   135  
   136  	var (
   137  		recording *sink.Handle
   138  		filePath  string
   139  		err       error
   140  	)
   141  
   142  	// Wait for the initial StartRecording message
   143  	select {
   144  	case req := <-reqCh:
   145  		startRecording := req.GetStart()
   146  		if startRecording == nil {
   147  			return fmt.Errorf("received invalid request %q, expected start request", req)
   148  		}
   149  
    150  		// The startRecording helper spawns a cleanup goroutine to remove all
    151  		// state associated with this recording when the context ctx is cancelled.
   152  		recording, filePath, err = s.startRecording(ctx, startRecording)
   153  		if err != nil {
   154  			return err
   155  		}
   156  	case err = <-errCh:
   157  		return err
   158  	case <-ctx.Done():
   159  		return ctx.Err()
   160  	}
   161  
   162  	// Send back a confirmation that the recording has started
   163  	err = stream.Send(recordingRunningResponse(recording.Stats()))
   164  	if err != nil {
    165  		return fmt.Errorf("failed to send confirmation response: %w", err)
   166  	}
   167  
   168  	for {
   169  		select {
   170  		// This case happens when the client has sent us a new request.
   171  		// We expect a start request if recording is nil, and a stop request
   172  		// otherwise.
   173  		case req := <-reqCh:
   174  			if req.GetStop() != nil {
   175  				recording.Stop()
   176  			} else {
   177  				return fmt.Errorf("received invalid request %q, expected stop request", req)
   178  			}
    179  		// This case is hit whenever the recording has updated its statistics
    180  		// (i.e. packets have been captured). We fetch the latest statistics and
    181  		// forward them to the client.
   182  		case <-recording.StatsUpdated:
   183  			err = stream.Send(recordingRunningResponse(recording.Stats()))
   184  			if err != nil {
   185  				return fmt.Errorf("failed to send recording running response: %w", err)
   186  			}
    187  		// This case happens when the recording has stopped, either due to the
    188  		// explicit stop above or because an error has occurred. If no error has
    189  		// occurred, we assemble the final RecordingStoppedResponse and exit.
    190  		// If an error occurred, we propagate it by returning it from this handler.
   191  		case <-recording.Done:
   192  			err = recording.Err()
   193  			if err != nil {
   194  				return fmt.Errorf("recorder recording error: %w", err)
   195  			}
   196  
   197  			err = stream.Send(recordingStoppedResponse(recording.Stats(), filePath))
   198  			if err != nil {
   199  				return fmt.Errorf("failed to send recording stopped response: %w", err)
   200  			}
   201  
   202  			return nil
    203  		// The following two cases happen when the client stream is either
    204  		// closed or cancelled. Simply return the error so that it is logged,
    205  		// and exit.
   206  		case err = <-errCh:
   207  			return err
   208  		case <-ctx.Done():
   209  			return ctx.Err()
   210  		}
   211  	}
   212  }
   213  
   214  const fileExistsRetries = 100
   215  
   216  var allowedFileChars = regexp.MustCompile("[^a-zA-Z0-9_.-]")
   217  
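         // createPcapFile creates a new, exclusively owned pcap file under
         // basedir. The file name is derived from the prefix, the current Unix
         // timestamp, a random number and the node name (for example
         // "hubble_1712345678_123456_node1.pcap"), with disallowed characters
         // replaced by underscores. On a name collision it retries up to
         // fileExistsRetries times.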
   218  func createPcapFile(basedir, prefix string) (f *os.File, filePath string, err error) {
   219  	try := 0
   220  	for {
   221  		startTime := time.Now().Unix()
   222  		random := rand.Uint32()
   223  		nodeName := nodeTypes.GetAbsoluteNodeName()
   224  		name := fmt.Sprintf("%s_%d_%d_%s.pcap", prefix, startTime, random, nodeName)
   225  		sanitizedName := allowedFileChars.ReplaceAllLiteralString(name, "_")
   226  		filePath = path.Join(basedir, sanitizedName)
   227  		f, err = os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0600)
   228  		if err != nil {
   229  			if os.IsExist(err) {
   230  				if try++; try < fileExistsRetries {
   231  					continue
   232  				}
   233  			}
   234  			return f, "", fmt.Errorf("failed to create pcap file %q: %w", filePath, err)
   235  		}
   236  
   237  		return f, filePath, nil
   238  	}
   239  }
   240  
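         // parseFilters converts the include filters of a start request into
         // datapath recorder tuples. It requires at least one filter, source
         // and destination CIDRs of the same IP protocol version, and ports
         // within the uint16 range.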
   241  func parseFilters(include []*recorderpb.Filter) ([]recorder.RecorderTuple, error) {
   242  	if len(include) == 0 {
   243  		return nil, errors.New("need to specify at least one include filter")
   244  	}
   245  
   246  	filters := []recorder.RecorderTuple{}
   247  	for _, f := range include {
   248  		srcIP, srcPrefix, err := net.ParseCIDR(f.GetSourceCidr())
   249  		if err != nil {
   250  			return nil, fmt.Errorf("failed to parse source cidr %q: %w", f.GetSourceCidr(), err)
   251  		}
   252  
   253  		dstIP, dstPrefix, err := net.ParseCIDR(f.GetDestinationCidr())
   254  		if err != nil {
    255  			return nil, fmt.Errorf("failed to parse destination cidr %q: %w", f.GetDestinationCidr(), err)
   256  		}
   257  
   258  		if (srcIP.To4() == nil) != (dstIP.To4() == nil) {
    259  			return nil, fmt.Errorf("source (%s) and destination cidr (%s) must use the same IP protocol version",
   260  				f.GetSourceCidr(), f.GetDestinationCidr())
   261  		}
   262  
   263  		const maxPort = 65535
   264  		if f.GetSourcePort() > maxPort {
   265  			return nil, fmt.Errorf("source port %d out of range", f.GetSourcePort())
   266  		}
   267  
   268  		if f.GetDestinationPort() > maxPort {
   269  			return nil, fmt.Errorf("destination port %d out of range", f.GetDestinationPort())
   270  		}
   271  
   272  		filters = append(filters, recorder.RecorderTuple{
   273  			SrcPrefix: *cidr.NewCIDR(srcPrefix),
   274  			SrcPort:   uint16(f.GetSourcePort()),
   275  			DstPrefix: *cidr.NewCIDR(dstPrefix),
   276  			DstPort:   uint16(f.GetDestinationPort()),
   277  			Proto:     u8proto.U8proto(f.GetProtocol()),
   278  		})
   279  	}
   280  
   281  	return filters, nil
   282  }
   283  
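         // File sink prefixes are restricted to a lowercase letter followed by
         // up to 19 lowercase letters or digits.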
   284  var fileSinkPrefixRegex = regexp.MustCompile("^[a-z][a-z0-9]{0,19}$")
   285  
   286  // startRecording starts a new recording. It will clean up any state
   287  // associated with the recording if ctx is cancelled or handle.Stop is called.
   288  func (s *Service) startRecording(
   289  	ctx context.Context,
   290  	req *recorderpb.StartRecording,
   291  ) (handle *sink.Handle, filePath string, err error) {
   292  	capLen := req.GetMaxCaptureLength()
   293  	prefix := req.GetFilesink().GetFilePrefix()
   294  	if prefix == "" {
   295  		prefix = defaultFileSinkPrefix
   296  	}
   297  
   298  	if !fileSinkPrefixRegex.MatchString(prefix) {
   299  		return nil, "", fmt.Errorf("invalid file sink prefix: %q", prefix)
   300  	}
   301  
   302  	filters, err := parseFilters(req.GetInclude())
   303  	if err != nil {
   304  		return nil, "", err
   305  	}
   306  
    307  	leaseID := s.ruleIDs.LeaseAvailableID()
    308  	if leaseID == idpool.NoID {
    309  		return nil, "", errors.New("unable to allocate capture rule id")
    310  	}
    311  	ruleID := uint16(leaseID)
   312  
   313  	var f *os.File
   314  	f, filePath, err = createPcapFile(s.opts.StoragePath, prefix)
   315  	if err != nil {
   316  		return nil, "", err
   317  	}
   318  
   319  	defer func() {
   320  		// clean up the recording if any of the subsequent steps fails
   321  		if err != nil {
   322  			_, _ = s.recorder.DeleteRecorder(recorder.ID(ruleID))
   323  			// remove the created pcap file
   324  			_ = f.Close()
   325  			_ = os.Remove(filePath)
   326  			// release will also invalidate the lease
   327  			_ = s.ruleIDs.Release(idpool.ID(ruleID))
   328  		}
   329  	}()
   330  
   331  	scopedLog := log.WithFields(logrus.Fields{
   332  		"ruleID":   ruleID,
   333  		"filePath": filePath,
   334  	})
   335  	scopedLog.Debug("starting new recording")
   336  
   337  	stop := req.GetStopCondition()
   338  	config := sink.PcapSink{
   339  		RuleID: ruleID,
   340  		Header: pcap.Header{
   341  			SnapshotLength: capLen,
   342  			Datalink:       pcap.Ethernet,
   343  		},
   344  		Writer: pcap.NewWriter(f),
   345  		StopCondition: sink.StopConditions{
   346  			PacketsCaptured: stop.GetPacketsCapturedCount(),
   347  			BytesCaptured:   stop.GetBytesCapturedCount(),
   348  			DurationElapsed: stop.GetTimeElapsed().AsDuration(),
   349  		},
   350  	}
   351  
   352  	// Upserting a new recorder can take up to a few seconds due to datapath
   353  	// regeneration. To avoid having the stop condition timer on the sink
   354  	// already running while the recorder is still being upserted, we install
   355  	// the recorder before the sink. This is safe, as sink.Dispatch silently
   356  	// ignores recordings for unknown sinks.
   357  	recInfo := &recorder.RecInfo{
   358  		ID:      recorder.ID(ruleID),
   359  		CapLen:  uint16(capLen),
   360  		Filters: filters,
   361  	}
   362  	_, err = s.recorder.UpsertRecorder(recInfo)
   363  	if err != nil {
   364  		return nil, "", err
   365  	}
   366  
   367  	handle, err = s.dispatch.StartSink(ctx, config)
   368  	if err != nil {
   369  		return nil, "", err
   370  	}
   371  
    372  	// Ensure the above recorder is deleted once the sink has stopped.
   373  	go func() {
   374  		<-handle.Done
   375  		scopedLog.Debug("stopping recording")
   376  		_, err := s.recorder.DeleteRecorder(recorder.ID(ruleID))
   377  		if err != nil {
   378  			scopedLog.WithError(err).Warning("failed to delete recorder")
   379  		}
   380  		s.ruleIDs.Release(idpool.ID(ruleID))
   381  	}()
   382  
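         	// Confirm the lease on the allocated rule ID; the goroutine above
         	// releases it again once the sink has stopped.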
   383  	s.ruleIDs.Use(leaseID)
   384  
   385  	return handle, filePath, nil
   386  }