github.com/cilium/cilium@v1.16.2/pkg/hubble/recorder/sink/dispatch.go

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package sink
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"runtime"
    10  
    11  	"github.com/sirupsen/logrus"
    12  	"golang.org/x/sys/unix"
    13  
    14  	"github.com/cilium/cilium/pkg/byteorder"
    15  	"github.com/cilium/cilium/pkg/hubble/recorder/pcap"
    16  	"github.com/cilium/cilium/pkg/lock"
    17  	"github.com/cilium/cilium/pkg/logging"
    18  	"github.com/cilium/cilium/pkg/logging/logfields"
    19  	"github.com/cilium/cilium/pkg/monitor"
    20  	monitorAPI "github.com/cilium/cilium/pkg/monitor/api"
    21  	"github.com/cilium/cilium/pkg/time"
    22  )
    23  
    24  var log = logging.DefaultLogger.WithField(logfields.LogSubsys, "recorder-sink")
    25  
    26  // record is a captured packet which will be written to file in the pcap format
    27  type record struct {
    28  	timestamp time.Time
    29  	ruleID    uint16
    30  	inclLen   uint32
    31  	origLen   uint32
    32  	data      []byte
    33  }
    34  
    35  // Handle enables the owner to subscribe to sink statistics
    36  type Handle struct {
    37  	// StatsUpdated is a channel which receives an empty message whenever
    38  	// the sink statistics have been updated.
    39  	StatsUpdated <-chan struct{}
    40  	// Done is a channel which is closed when this sink has been shut down.
    41  	Done <-chan struct{}
    42  
    43  	sink *sink
    44  }
    45  
    46  // Stats returns the latest statistics for this sink.
    47  func (h *Handle) Stats() Statistics {
    48  	return h.sink.copyStats()
    49  }
    50  
    51  // Stop requests the underlying sink to stop. Handle.Done will be closed
    52  // once the sink has drained its queue and stopped.
    53  func (h *Handle) Stop() {
    54  	h.sink.stop()
    55  }
    56  
    57  // Err returns the last error on this sink once the sink has stopped.
    58  func (h *Handle) Err() error {
    59  	return h.sink.err()
    60  }
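
// exampleWatchHandle is a minimal, hypothetical sketch (not part of the
// upstream file) of how an owner of a Handle might consume it: print the
// latest statistics on every update and exit once the sink has stopped.
func exampleWatchHandle(h *Handle) {
	for {
		select {
		case <-h.StatsUpdated:
			stats := h.Stats()
			fmt.Printf("captured %d packets (%d bytes), lost %d packets\n",
				stats.PacketsWritten, stats.BytesWritten, stats.PacketsLost)
		case <-h.Done:
			if err := h.Err(); err != nil {
				fmt.Printf("sink stopped with error: %v\n", err)
			}
			return
		}
	}
}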
    61  
    62  // Statistics contains the statistics for a pcap sink
    63  type Statistics struct {
    64  	PacketsWritten uint64
    65  	BytesWritten   uint64
    66  	PacketsLost    uint64
    67  	BytesLost      uint64
    68  }
    69  
    70  // StopConditions defines a set of values which cause the sink to stop
    71  // recording if any of them are hit. Zero-valued conditions are ignored.
    72  type StopConditions struct {
    73  	PacketsCaptured uint64
    74  	BytesCaptured   uint64
    75  	DurationElapsed time.Duration
    76  }
    77  
    78  // PcapSink defines the parameters of a sink which writes to a pcap.RecordWriter
    79  type PcapSink struct {
    80  	RuleID        uint16
    81  	Header        pcap.Header
    82  	Writer        pcap.RecordWriter
    83  	StopCondition StopConditions
    84  }
    85  
    86  // Dispatch implements consumer.MonitorConsumer and dispatches incoming
    87  // recorder captures to registered sinks based on their rule ID.
    88  type Dispatch struct {
    89  	mutex lock.RWMutex
    90  
    91  	bootTimeOffset int64
    92  
    93  	sinkQueueSize int
    94  	sinkByRuleID  map[uint16]*sink
    95  }
    96  
    97  // NewDispatch creates a new sink dispatcher. Each registered sink may have a
    98  // queue of up to sinkQueueSize pending captures.
    99  func NewDispatch(sinkQueueSize int) (*Dispatch, error) {
   100  	if sinkQueueSize < 1 {
   101  		return nil, fmt.Errorf("invalid sink queue size: %d", sinkQueueSize)
   102  	}
   103  
   104  	bootTimeOffset, err := estimateBootTimeOffset()
   105  	if err != nil {
   106  		return nil, fmt.Errorf("failed to obtain boot time clock: %w", err)
   107  	}
   108  
   109  	return &Dispatch{
   110  		bootTimeOffset: bootTimeOffset,
   111  		sinkQueueSize:  sinkQueueSize,
   112  		sinkByRuleID:   map[uint16]*sink{},
   113  	}, nil
   114  }
   115  
   116  // StartSink starts a new sink for the pcap sink configuration p. Any
   117  // captures with a matching rule ID will be forwarded to the pcap sink p.Writer.
   118  // The provided p.Header is written to the pcap sink during initialization.
   119  // The sink is unregistered automatically when it stops. A sink is stopped for
   120  // one of the following four reasons. In all cases, Handle.Done will be closed.
   121  //   - Explicitly via Handle.Stop (Handle.Err() == nil)
   122  //   - When one of the p.StopCondition limits is hit (Handle.Err() == nil)
   123  //   - When the context ctx is cancelled (Handle.Err() != nil)
   124  //   - When an error occurred (Handle.Err() != nil)
   125  func (d *Dispatch) StartSink(ctx context.Context, p PcapSink) (*Handle, error) {
   126  	d.mutex.Lock()
   127  	defer d.mutex.Unlock()
   128  
   129  	if _, ok := d.sinkByRuleID[p.RuleID]; ok {
   130  		return nil, fmt.Errorf("sink for rule id %d already registered", p.RuleID)
   131  	}
   132  
   133  	s := startSink(ctx, p, d.sinkQueueSize)
   134  	d.sinkByRuleID[p.RuleID] = s
   135  
   136  	go func() {
   137  		<-s.done
   138  		d.mutex.Lock()
   139  		delete(d.sinkByRuleID, p.RuleID)
   140  		d.mutex.Unlock()
   141  	}()
   142  
   143  	return &Handle{
   144  		StatsUpdated: s.trigger,
   145  		Done:         s.done,
   146  		sink:         s,
   147  	}, nil
   148  }
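
// exampleRecordWithStopConditions is a minimal, hypothetical sketch (not part
// of the upstream file) of the call flow described above: start a sink for
// rule ID 1 and block until it stops. The pcap.RecordWriter w is assumed to be
// supplied by the caller, and registering the Dispatch with the monitor
// consumer machinery is omitted; both are outside the scope of this file.
func exampleRecordWithStopConditions(ctx context.Context, w pcap.RecordWriter) error {
	// 1024 is an arbitrary per-sink queue size chosen for this sketch.
	d, err := NewDispatch(1024)
	if err != nil {
		return err
	}
	handle, err := d.StartSink(ctx, PcapSink{
		RuleID: 1,
		// A real caller populates the pcap header (snapshot length, link
		// type); the zero value is used here only to keep the sketch short.
		Header: pcap.Header{},
		Writer: w,
		// Zero-valued conditions are ignored, so this sink stops after 1000
		// packets or 30 seconds, whichever is hit first.
		StopCondition: StopConditions{
			PacketsCaptured: 1000,
			DurationElapsed: 30 * time.Second,
		},
	})
	if err != nil {
		return err
	}
	<-handle.Done
	return handle.Err()
}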
   149  
   150  func (d *Dispatch) decodeRecordCaptureLocked(data []byte) (rec record, err error) {
   151  	dataLen := uint32(len(data))
   152  	if dataLen < monitor.RecorderCaptureLen {
   153  		return record{}, fmt.Errorf("not enough data to decode capture message: %d", dataLen)
   154  	}
   155  
   156  	// This needs to stay in sync with struct capture_msg from
   157  	// bpf/include/pcap.h.
   158  	// We could use binary.Read on monitor.RecorderCapture, but since it
   159  	// requires reflection, it is too slow to use on the critical path here.
   160  	const (
   161  		offsetRuleID         = 2
   162  		offsetTimeBoot       = 8
   163  		offsetCaptureLength  = 16
   164  		offsetOriginalLength = 20
   165  	)
   166  	n := byteorder.Native
   167  	ruleID := n.Uint16(data[offsetRuleID:])
   168  	timeBoot := n.Uint64(data[offsetTimeBoot:])
   169  	capLen := n.Uint32(data[offsetCaptureLength:])
   170  	origLen := n.Uint32(data[offsetOriginalLength:])
   171  
   172  	// data may contain trailing garbage from the perf ring buffer
   173  	// https://lore.kernel.org/patchwork/patch/1244339/
   174  	packetEnd := monitor.RecorderCaptureLen + capLen
   175  	if dataLen < packetEnd {
   176  		return record{}, fmt.Errorf("capture record too short: got:%d < want:%d", dataLen, packetEnd)
   177  	}
   178  	packet := data[monitor.RecorderCaptureLen:packetEnd]
   179  
   180  	return record{
   181  		timestamp: time.Unix(0, d.bootTimeOffset+int64(timeBoot)),
   182  		ruleID:    ruleID,
   183  		inclLen:   capLen,
   184  		origLen:   origLen,
   185  		data:      packet,
   186  	}, nil
   187  }
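
// exampleEncodeCapture is a hypothetical test-helper sketch (not part of the
// upstream file) that builds a synthetic capture message with the layout
// decoded above. The hard-coded offsets mirror the constants in
// decodeRecordCaptureLocked and thus struct capture_msg from
// bpf/include/pcap.h.
func exampleEncodeCapture(ruleID uint16, timeBoot uint64, payload []byte) []byte {
	buf := make([]byte, int(monitor.RecorderCaptureLen)+len(payload))
	buf[0] = byte(monitorAPI.MessageTypeRecCapture)
	n := byteorder.Native
	n.PutUint16(buf[2:], ruleID)                // offsetRuleID
	n.PutUint64(buf[8:], timeBoot)              // offsetTimeBoot
	n.PutUint32(buf[16:], uint32(len(payload))) // offsetCaptureLength
	n.PutUint32(buf[20:], uint32(len(payload))) // offsetOriginalLength
	copy(buf[int(monitor.RecorderCaptureLen):], payload)
	return buf
}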
   188  
   189  const estimationRounds = 25
   190  
   191  func estimateBootTimeOffset() (bootTimeOffset int64, err error) {
   192  	// The datapath is currently using ktime_get_boot_ns for the pcap timestamp,
   193  	// which corresponds to CLOCK_BOOTTIME. To convert CLOCK_BOOTTIME to
   194  	// CLOCK_REALTIME (i.e. a unix timestamp), we estimate their offset here.
   195  
   196  	// There can be an arbitrary amount of time between the execution of
   197  	// time.Now() and unix.ClockGettime() below, especially under scheduler
   198  	// pressure during program startup. To reduce the error introduced by these
   199  	// delays, we pin the current goroutine to its OS thread and measure the
   200  	// clocks multiple times, taking only the smallest observed difference
   201  	// between the two values (which implies the smallest possible delay
   202  	// between the two snapshots).
   203  	var minDiff int64 = 1<<63 - 1
   204  
   205  	runtime.LockOSThread()
   206  	defer runtime.UnlockOSThread()
   207  	for round := 0; round < estimationRounds; round++ {
   208  		var bootTimespec unix.Timespec
   209  
   210  		// Ideally we would use __vdso_clock_gettime for both clocks here,
   211  		// to have as little overhead as possible.
   212  		// time.Now() will actually use VDSO on Go 1.9+, but calling
   213  		// unix.ClockGettime to obtain CLOCK_BOOTTIME is a regular system call
   214  		// for now.
   215  		unixTime := time.Now()
   216  		err = unix.ClockGettime(unix.CLOCK_BOOTTIME, &bootTimespec)
   217  		if err != nil {
   218  			return 0, err
   219  		}
   220  
   221  		offset := unixTime.UnixNano() - bootTimespec.Nano()
   222  		diff := offset
   223  		if diff < 0 {
   224  			diff = -diff
   225  		}
   226  
   227  		if diff < minDiff {
   228  			minDiff = diff
   229  			bootTimeOffset = offset
   230  		}
   231  	}
   232  
   233  	return bootTimeOffset, nil
   234  }
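
// exampleBootTimeToWallClock is a minimal sketch (not part of the upstream
// file) showing how the estimated offset is applied to a raw ktime_get_boot_ns
// value, mirroring the conversion performed in decodeRecordCaptureLocked.
func exampleBootTimeToWallClock(bootTimeOffset int64, timeBoot uint64) time.Time {
	// CLOCK_REALTIME (ns) ~= bootTimeOffset + CLOCK_BOOTTIME (ns)
	return time.Unix(0, bootTimeOffset+int64(timeBoot))
}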
   235  
   236  // NotifyPerfEvent implements consumer.MonitorConsumer
   237  func (d *Dispatch) NotifyPerfEvent(data []byte, cpu int) {
   238  	if len(data) == 0 || data[0] != monitorAPI.MessageTypeRecCapture {
   239  		return
   240  	}
   241  
   242  	d.mutex.Lock()
   243  	defer d.mutex.Unlock()
   244  
   245  	rec, err := d.decodeRecordCaptureLocked(data)
   246  	if err != nil {
   247  		log.WithError(err).Warning("Failed to parse capture record")
   248  		return
   249  	}
   250  
   251  	// We silently drop records with unknown rule ids
   252  	if s, ok := d.sinkByRuleID[rec.ruleID]; ok {
   253  		s.enqueue(rec)
   254  	}
   255  }
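
// exampleInjectCapture is a hypothetical sketch (not part of the upstream
// file) of feeding a synthetic event through the dispatcher, e.g. from a unit
// test, using the exampleEncodeCapture helper sketched earlier in this file.
// Events that do not start with MessageTypeRecCapture, and rule IDs without a
// registered sink, are silently ignored.
func exampleInjectCapture(d *Dispatch, ruleID uint16, payload []byte) {
	msg := exampleEncodeCapture(ruleID, 0 /* boot time ns */, payload)
	d.NotifyPerfEvent(msg, 0 /* cpu */)
}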
   256  
   257  // NotifyPerfEventLost implements consumer.MonitorConsumer
   258  func (d *Dispatch) NotifyPerfEventLost(numLostEvents uint64, cpu int) {
   259  	log.WithFields(logrus.Fields{
   260  		"numEvents": numLostEvents,
   261  		"cpu":       cpu,
   262  	}).Warning("Perf ring buffer events lost. This may affect captured packets.")
   263  }
   264  
   265  // NotifyAgentEvent implements consumer.MonitorConsumer
   266  func (d *Dispatch) NotifyAgentEvent(typ int, message interface{}) {
   267  	// ignored
   268  }