github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/pkg/ebpftracer/tracer.go

package ebpftracer

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"net/netip"
	"os"
	"sync"
	"time"

	"github.com/castai/kvisor/pkg/cgroup"
	"github.com/castai/kvisor/pkg/containers"
	"github.com/castai/kvisor/pkg/ebpftracer/events"
	"github.com/castai/kvisor/pkg/ebpftracer/signature"
	"github.com/castai/kvisor/pkg/ebpftracer/types"
	"github.com/castai/kvisor/pkg/logging"
	"github.com/castai/kvisor/pkg/metrics"
	"github.com/castai/kvisor/pkg/proc"
	"github.com/cilium/ebpf/perf"
	"github.com/go-playground/validator/v10"
	"github.com/google/gopacket/layers"
	"github.com/samber/lo"
	"golang.org/x/sync/errgroup"
	"golang.org/x/sys/unix"
)

// ActualDestinationGetter is used to find the actual destination IP.
// Usually this info is obtained from conntrack.
type ActualDestinationGetter interface {
	GetDestination(src, dst netip.AddrPort) (netip.AddrPort, bool)
}

type ContainerClient interface {
	GetContainerForCgroup(ctx context.Context, cgroup cgroup.ID) (*containers.Container, error)
	CleanupCgroup(cgroup cgroup.ID)
}

type CgroupClient interface {
	LoadCgroup(id cgroup.ID, path string)
	CleanupCgroup(cgroup cgroup.ID)
	IsDefaultHierarchy(uint32) bool
}

type Config struct {
	BTFPath                string
	EventsPerCPUBuffer     int
	EventsOutputChanSize   int
	GCInterval             time.Duration
	DefaultCgroupsVersion  string `validate:"required,oneof=V1 V2"`
	DebugEnabled           bool
	ContainerClient        ContainerClient
	CgroupClient           CgroupClient
	SignatureEngine        *signature.SignatureEngine
	MountNamespacePIDStore *types.PIDsPerNamespace
	// All PIDs reported from ebpf will be normalized to this PID namespace.
	HomePIDNS                          proc.NamespaceID
	AllowAnyEvent                      bool
	NetflowOutputChanSize              int
	NetflowSampleSubmitIntervalSeconds uint64
}

type cgroupCleanupRequest struct {
	cgroupID     cgroup.ID
	cleanupAfter time.Time
}

type Tracer struct {
	log *logging.Logger
	cfg Config

	bootTime uint64

	module    *module
	eventsSet map[events.ID]definition

	policyMu          sync.Mutex
	policy            *Policy
	eventPoliciesMap  map[events.ID]*EventPolicy
	cgroupEventPolicy map[cgroup.ID]map[events.ID]*cgroupEventPolicy
	signatureEventMap map[events.ID]struct{}

	eventsChan        chan *types.Event
	netflowEventsChan chan *types.Event

	removedCgroupsMu sync.Mutex
	removedCgroups   map[uint64]struct{}

	dnsPacketParser *layers.DNS

	cgroupCleanupMu         sync.Mutex
	requestedCgroupCleanups []cgroupCleanupRequest

	cleanupTimerTickRate time.Duration
	cgroupCleanupDelay   time.Duration
}

func New(log *logging.Logger, cfg Config) *Tracer {
	if err := validator.New().Struct(cfg); err != nil {
		panic(fmt.Errorf("invalid ebpftracer config: %w", err).Error())
	}

	log = log.WithField("component", "ebpftracer")
	m := newModule(log, moduleConfig{
		BTFObjPath: cfg.BTFPath,
	})

	if cfg.EventsPerCPUBuffer == 0 {
		cfg.EventsPerCPUBuffer = 8192
	}
	if cfg.EventsOutputChanSize == 0 {
		cfg.EventsOutputChanSize = 16384
	}
	if cfg.GCInterval == 0 {
		cfg.GCInterval = 15 * time.Second
	}

	var ts unix.Timespec
	err := unix.ClockGettime(unix.CLOCK_MONOTONIC, &ts)
	if err != nil {
		panic(fmt.Errorf("getting clock time: %w", err).Error())
	}
	bootTime := time.Now().UnixNano() - ts.Nano()

	t := &Tracer{
		log:                  log,
		cfg:                  cfg,
		module:               m,
		bootTime:             uint64(bootTime),
		eventsChan:           make(chan *types.Event, cfg.EventsOutputChanSize),
		netflowEventsChan:    make(chan *types.Event, cfg.NetflowOutputChanSize),
		removedCgroups:       map[uint64]struct{}{},
		eventPoliciesMap:     map[events.ID]*EventPolicy{},
		cgroupEventPolicy:    map[cgroup.ID]map[events.ID]*cgroupEventPolicy{},
		dnsPacketParser:      &layers.DNS{},
		signatureEventMap:    map[events.ID]struct{}{},
		cleanupTimerTickRate: 1 * time.Minute,
		cgroupCleanupDelay:   1 * time.Minute,
	}

	return t
}
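
// runTracerExample is a hypothetical usage sketch, not part of the original
// file: it shows the expected lifecycle of construct, Load, ApplyPolicy, then
// Run while consuming the events channel. The empty policy is a placeholder;
// real callers select concrete event IDs.
func runTracerExample(ctx context.Context, log *logging.Logger, cc ContainerClient, cg CgroupClient) error {
	t := New(log, Config{
		DefaultCgroupsVersion: "V2", // the validator tag requires V1 or V2
		ContainerClient:       cc,
		CgroupClient:          cg,
	})
	if err := t.Load(); err != nil {
		return err
	}
	defer t.Close()

	if err := t.ApplyPolicy(&Policy{}); err != nil {
		return err
	}

	go func() {
		for e := range t.Events() {
			_ = e // Decoding and filtering happen inside the tracer; consume here.
		}
	}()

	return t.Run(ctx)
}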

func (t *Tracer) Load() error {
	if err := t.module.load(t.cfg.HomePIDNS, t.cfg.NetflowSampleSubmitIntervalSeconds); err != nil {
		return fmt.Errorf("loading ebpf module: %w", err)
	}
	t.eventsSet = newEventsDefinitionSet(t.module.objects)
	return nil
}

func (t *Tracer) Close() error {
	return t.module.close()
}

func (t *Tracer) Run(ctx context.Context) error {
	t.log.Infof("running")
	defer t.log.Infof("stopping")

	if !t.module.loaded.Load() {
		return errors.New("tracer is not loaded")
	}
	errg, ctx := errgroup.WithContext(ctx)
	if t.cfg.DebugEnabled {
		errg.Go(func() error {
			return t.debugEventsLoop(ctx)
		})
	}
	errg.Go(func() error {
		return t.eventsReadLoop(ctx)
	})
	errg.Go(func() error {
		return t.signalReadLoop(ctx)
	})
	errg.Go(func() error {
		return t.cgroupCleanupLoop(ctx)
	})

	return errg.Wait()
}

func (t *Tracer) Events() <-chan *types.Event {
	return t.eventsChan
}

func (t *Tracer) NetflowEvents() <-chan *types.Event {
	return t.netflowEventsChan
}

func (t *Tracer) GetEventName(id events.ID) string {
	if def, found := t.eventsSet[id]; found {
		return def.name
	}
	return ""
}

func (t *Tracer) signalReadLoop(ctx context.Context) error {
	eventsReader, err := perf.NewReader(t.module.objects.Signals, t.cfg.EventsPerCPUBuffer)
	if err != nil {
		return err
	}
	defer eventsReader.Close()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		record, err := eventsReader.Read()
		if err != nil {
			if t.cfg.DebugEnabled {
				t.log.Warnf("reading signals: %v", err)
			}
			continue
		}
		if record.LostSamples > 0 {
			t.log.Warnf("lost %d signals", record.LostSamples)
			metrics.AgentKernelLostEventsTotal.Add(float64(record.LostSamples))
			continue
		}
		metrics.AgentPulledEventsTotal.Inc()

		if err := t.decodeAndHandleSignal(ctx, record.RawSample); err != nil {
			if t.cfg.DebugEnabled || errors.Is(err, ErrPanic) {
				t.log.Errorf("decoding signal: %v", err)
			}
			metrics.AgentDecodeEventErrorsTotal.Inc()
			continue
		}
	}
}

func (t *Tracer) eventsReadLoop(ctx context.Context) error {
	eventsReader, err := perf.NewReader(t.module.objects.Events, t.cfg.EventsPerCPUBuffer)
	if err != nil {
		return err
	}
	defer eventsReader.Close()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		record, err := eventsReader.Read()
		if err != nil {
			if t.cfg.DebugEnabled {
				t.log.Warnf("reading event: %v", err)
			}
			continue
		}
		if record.LostSamples > 0 {
			t.log.Warnf("lost %d events", record.LostSamples)
			metrics.AgentKernelLostEventsTotal.Add(float64(record.LostSamples))
			continue
		}
		metrics.AgentPulledEventsTotal.Inc()

		if err := t.decodeAndExportEvent(ctx, record.RawSample); err != nil {
			if t.cfg.DebugEnabled || errors.Is(err, ErrPanic) {
				t.log.Errorf("decoding event: %v", err)
			}
			metrics.AgentDecodeEventErrorsTotal.Inc()
			continue
		}
	}
}
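
// eventWallClockTime is an illustrative helper, an assumption not used
// elsewhere in this file: event timestamps produced by the eBPF side are
// monotonic nanoseconds since boot, so adding the bootTime captured in New
// yields wall-clock time.
func (t *Tracer) eventWallClockTime(monotonicNs uint64) time.Time {
	return time.Unix(0, int64(t.bootTime)+int64(monotonicNs))
}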

func (t *Tracer) findAllRequiredEvents(id events.ID, out map[events.ID]struct{}) {
	// No need to load the whole dependency tree twice.
	if _, found := out[id]; found {
		return
	}

	def := t.eventsSet[id]
	out[id] = struct{}{}
	for _, depID := range def.dependencies.ids {
		t.findAllRequiredEvents(depID, out)
	}
}

func (t *Tracer) ApplyPolicy(policy *Policy) error {
	if !t.module.loaded.Load() {
		return errors.New("tracer is not loaded")
	}
	t.policyMu.Lock()
	defer t.policyMu.Unlock()

	if t.policy != nil {
		// TODO(Kvisord): Here we can diff the policy against the previous one and update it dynamically.
		return errors.New("policy update is not supported yet")
	}

	t.policy = policy

	eventsParams := getParamTypes(t.eventsSet)
	requiredEventsIDs := make(map[events.ID]struct{})
	for _, event := range policy.Events {
		t.eventPoliciesMap[event.ID] = event
		t.findAllRequiredEvents(event.ID, requiredEventsIDs)
	}
	if t.cfg.SignatureEngine != nil {
		for _, eventID := range policy.SignatureEvents {
			t.signatureEventMap[eventID] = struct{}{}
			t.findAllRequiredEvents(eventID, requiredEventsIDs)
		}
	}

	for _, eventID := range policy.SystemEvents {
		t.findAllRequiredEvents(eventID, requiredEventsIDs)
	}

	eventsBpfMapConfig := make(map[events.ID][]byte)

	objs := t.module.objects

	var tailCalls []TailCall
	probesToAttach := map[handle]bool{}
	for id := range requiredEventsIDs {
		def, found := t.eventsSet[id]
		if !found {
			return fmt.Errorf("missing event definition for id %d", id)
		}

		tailCalls = append(tailCalls, def.dependencies.tailCalls...)
		if def.syscall {
			probesToAttach[ProbeSyscallEnter__Internal] = true
			probesToAttach[ProbeSyscallExit__Internal] = true
			// Add default tail calls for syscall events.
			if len(def.dependencies.tailCalls) == 0 && !def.dependencies.skipDefaultTailCalls {
				tailCalls = append(tailCalls, getDefaultSyscallTailCalls(objs, def)...)
			}
		}
		for _, dep := range def.dependencies.probes {
			// A probe stays optional only while no event marks it as required.
			if required, found := probesToAttach[dep.handle]; found {
				if !required {
					probesToAttach[dep.handle] = dep.required
				}
			} else {
				probesToAttach[dep.handle] = dep.required
			}
		}

		eventsBpfMapConfig[id] = marshalEventConfig(eventsParams, id)
	}

	// Attach selected probes.
	for handle, required := range probesToAttach {
		if err := t.module.attachProbe(handle); err != nil {
			if required {
				return fmt.Errorf("attaching probe %d: %w", handle, err)
			}
			t.log.Warnf("attaching optional probe %d: %v", handle, err)
		}
	}

	// Send event configs to the events eBPF map.
	for id, cfg := range eventsBpfMapConfig {
		if err := t.module.objects.EventsMap.Update(&id, cfg, 0); err != nil {
			return fmt.Errorf("updating events map, event %d: %w", id, err)
		}
	}
	config := t.computeConfigValues(policy)
	if err := t.module.objects.ConfigMap.Update(uint32(0), config, 0); err != nil {
		return fmt.Errorf("updating config map: %w", err)
	}

	// Initialize tail call dependencies.
	for _, tailCall := range tailCalls {
		if err := t.initTailCall(tailCall); err != nil {
			return fmt.Errorf("failed to initialize tail call: %w", err)
		}
	}

	return nil
}
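
// applyPolicySketch shows the shape of a minimal policy: each EventPolicy
// selects one event ID, and ApplyPolicy expands its dependency tree into
// probes, tail calls, and per-event configs. The event ID is taken as a
// parameter here because concrete IDs live in the events package.
func applyPolicySketch(t *Tracer, id events.ID) error {
	return t.ApplyPolicy(&Policy{
		Events: []*EventPolicy{{ID: id}},
	})
}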

func marshalEventConfig(eventsParams map[events.ID][]ArgType, id events.ID) []byte {
	eventConfigVal := make([]byte, 16)
	// Bitmap of policies that require this event to be submitted.
	binary.LittleEndian.PutUint64(eventConfigVal[0:8], 1)
	// Encoded event parameter types, packed 8 bits per parameter.
	var paramTypes uint64
	params := eventsParams[id]
	for n, paramType := range params {
		paramTypes = paramTypes | (uint64(paramType) << (8 * n))
	}
	binary.LittleEndian.PutUint64(eventConfigVal[8:16], paramTypes)
	return eventConfigVal
}

func getDefaultSyscallTailCalls(objs *tracerObjects, def definition) []TailCall {
	return []TailCall{
		{objs.SysEnterInitTail, objs.SysEnterInit, []uint32{uint32(def.ID)}},
		{objs.SysEnterSubmitTail, objs.SysEnterSubmit, []uint32{uint32(def.ID)}},
		{objs.SysExitInitTail, objs.SysExitInit, []uint32{uint32(def.ID)}},
		{objs.SysExitSubmitTail, objs.SysExitSubmit, []uint32{uint32(def.ID)}},
	}
}

func getParamTypes(eventsSet map[events.ID]definition) map[events.ID][]ArgType {
	eventsParams := make(map[events.ID][]ArgType)
	for _, eventDefinition := range eventsSet {
		id := eventDefinition.ID
		for _, param := range eventDefinition.params {
			eventsParams[id] = append(eventsParams[id], getParamType(param.Type))
		}
	}
	return eventsParams
}

const (
	optExecEnv uint32 = 1 << iota
	optCaptureFilesWrite
	optExtractDynCode
	optStackAddresses
	optCaptureModules
	optCgroupV1
	optTranslateFDFilePath
	optCaptureBpf
	optCaptureFileRead
)

func (t *Tracer) getOptionsConfig(p *Policy) uint32 {
	var cOptVal uint32

	if p.Output.ExecEnv {
		cOptVal |= optExecEnv
	}
	if p.Output.StackAddresses {
		cOptVal |= optStackAddresses
	}
	// TODO: Check other options.
	//if t.config.Capture.FileWrite.Capture {
	//	cOptVal |= optCaptureFilesWrite
	//}
	//if t.config.Capture.FileRead.Capture {
	//	cOptVal |= optCaptureFileRead
	//}
	//if t.config.Capture.Module {
	//	cOptVal |= optCaptureModules
	//}
	//if t.config.Capture.Bpf {
	//	cOptVal |= optCaptureBpf
	//}
	//if t.config.Capture.Mem {
	//	cOptVal |= optExtractDynCode
	//}
	//if t.config.Output.ParseArgumentsFDs {
	//	cOptVal |= optTranslateFDFilePath
	//}
	if t.cfg.DefaultCgroupsVersion == "V1" {
		cOptVal |= optCgroupV1
	}
	return cOptVal
}
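
// unmarshalEventConfig is a sketch of the inverse of marshalEventConfig:
// it splits the 16-byte config value back into the policy bitmap and the
// packed parameter types. It assumes a zero ArgType marks the end of the
// parameter list, which holds as long as 0 is not a valid parameter type.
func unmarshalEventConfig(raw []byte) (policyBitmap uint64, paramTypes []ArgType) {
	policyBitmap = binary.LittleEndian.Uint64(raw[0:8])
	packed := binary.LittleEndian.Uint64(raw[8:16])
	for packed != 0 {
		paramTypes = append(paramTypes, ArgType(packed&0xff))
		packed >>= 8
	}
	return policyBitmap, paramTypes
}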

func (t *Tracer) computeConfigValues(p *Policy) []byte {
	// config_entry
	configVal := make([]byte, 256)

	// tracee_pid
	binary.LittleEndian.PutUint32(configVal[0:4], uint32(os.Getpid()))
	// options
	binary.LittleEndian.PutUint32(configVal[4:8], t.getOptionsConfig(p))
	// cgroup_v1_hid
	//binary.LittleEndian.PutUint32(configVal[8:12], uint32(t.containers.GetDefaultCgroupHierarchyID()))
	binary.LittleEndian.PutUint32(configVal[8:12], 0)
	// padding
	binary.LittleEndian.PutUint32(configVal[12:16], 0)

	id := 0
	byteIndex := id / 8
	bitOffset := id % 8

	// enabled_scopes
	configVal[216+byteIndex] |= 1 << bitOffset

	// compute all policies internals
	//t.config.Policies.Compute()

	// uid_max
	//binary.LittleEndian.PutUint64(configVal[224:232], t.config.Policies.UIDFilterMax())
	//// uid_min
	//binary.LittleEndian.PutUint64(configVal[232:240], t.config.Policies.UIDFilterMin())
	//// pid_max
	//binary.LittleEndian.PutUint64(configVal[240:248], t.config.Policies.PIDFilterMax())
	//// pid_min
	//binary.LittleEndian.PutUint64(configVal[248:256], t.config.Policies.PIDFilterMin())

	return configVal
}

func (t *Tracer) initTailCall(tailCall TailCall) error {
	tailCallIndexes := tailCall.indexes
	// Pick the eBPF program file descriptor.
	bpfProgFD := uint32(tailCall.ebpfProg.FD())
	if tailCall.ebpfProg.FD() < 0 {
		return fmt.Errorf("ebpf tail call program fd is negative")
	}

	t.log.Debugf("init tail call, map=%s, prog=%s", tailCall.ebpfMap.String(), tailCall.ebpfProg.String())

	// Pick all indexes (event or syscall IDs) the BPF program should be related to.
	for _, index := range tailCallIndexes {
		index := index
		// Special treatment for indexes of syscall events.
		if t.eventsSet[events.ID(index)].syscall {
			// Workaround: do not map the eBPF program to unsupported syscalls (e.g. on arm64).
			if index >= uint32(events.Unsupported) {
				continue
			}
		}
		// Update the given eBPF map with the eBPF program file descriptor at the given index.
		err := tailCall.ebpfMap.Update(&index, &bpfProgFD, 0)
		if err != nil {
			return err
		}
	}

	return nil
}
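
// setScopeBit isolates the enabled_scopes bit arithmetic used by
// computeConfigValues above: scope id N maps to bit N%8 of byte N/8 inside
// the bitmap that starts at offset 216 of config_entry. Extracted here as
// an illustration only.
func setScopeBit(configVal []byte, id int) {
	byteIndex := id / 8
	bitOffset := id % 8
	configVal[216+byteIndex] |= 1 << bitOffset
}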

func (t *Tracer) debugEventsLoop(ctx context.Context) error {
	rd, err := perf.NewReader(t.module.objects.DebugEvents, 2048)
	if err != nil {
		return fmt.Errorf("creating debug events perf reader: %w", err)
	}
	defer rd.Close()

	var e types.RawDebugEvent
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		v, err := rd.Read()
		if err != nil {
			if errors.Is(err, perf.ErrClosed) {
				return nil
			}
			continue
		}

		if v.LostSamples > 0 {
			t.log.Warnf("lost samples %d", v.LostSamples)
		}
		if len(v.RawSample) == 0 {
			continue
		}
		if err := binary.Read(bytes.NewBuffer(v.RawSample), binary.LittleEndian, &e); err != nil {
			return fmt.Errorf("read event binary: %w", err)
		}

		fmt.Println(e.String())
	}
}

func (t *Tracer) allowedByPolicyPre(ctx *types.EventContext) error {
	policy := t.getPolicy(ctx.EventID, ctx.CgroupID)
	if policy != nil {
		return policy.allowPre(ctx)
	}
	// No policy.
	return nil
}

func (t *Tracer) allowedByPolicy(eventID events.ID, cgroupID uint64, event *types.Event) error {
	policy := t.getPolicy(eventID, cgroupID)
	if policy != nil {
		return policy.allow(event)
	}
	// No policy.
	return nil
}

// getPolicy returns the lazily created per-cgroup policy for the given event,
// or nil if no policy is configured for that event.
func (t *Tracer) getPolicy(eventID events.ID, cgroupID uint64) *cgroupEventPolicy {
	t.policyMu.Lock()
	defer t.policyMu.Unlock()

	eventPolicy, found := t.eventPoliciesMap[eventID]
	if !found {
		return nil
	}

	cgPolicyMap, found := t.cgroupEventPolicy[cgroupID]
	if !found {
		cgPolicyMap = make(map[events.ID]*cgroupEventPolicy)
		t.cgroupEventPolicy[cgroupID] = cgPolicyMap
	}

	cgPolicy, found := cgPolicyMap[eventID]
	if !found {
		cgPolicy = newCgroupEventPolicy(eventPolicy)
		cgPolicyMap[eventID] = cgPolicy
	}
	return cgPolicy
}

func (t *Tracer) cgroupCleanupLoop(ctx context.Context) error {
	cleanupTimer := time.NewTicker(t.cleanupTimerTickRate)
	defer cleanupTimer.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-cleanupTimer.C:
		}

		now := time.Now()
		var toCleanup []cgroupCleanupRequest

		t.cgroupCleanupMu.Lock()
		toCleanup, t.requestedCgroupCleanups = splitCleanupRequests(now, t.requestedCgroupCleanups)
		t.cgroupCleanupMu.Unlock()

		cgroupsToCleanup := lo.Map(toCleanup, func(item cgroupCleanupRequest, index int) cgroup.ID {
			return item.cgroupID
		})
		t.removeCgroups(cgroupsToCleanup)
	}
}
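
// splitCleanupRequestsExample illustrates the contract of
// splitCleanupRequests (defined below) with hypothetical requests: entries
// due at or before now land in the first slice, later ones stay queued.
func splitCleanupRequestsExample(now time.Time) {
	due, pending := splitCleanupRequests(now, []cgroupCleanupRequest{
		{cgroupID: 1, cleanupAfter: now.Add(-time.Minute)}, // already due
		{cgroupID: 2, cleanupAfter: now.Add(time.Minute)},  // still pending
	})
	fmt.Printf("due=%d pending=%d\n", len(due), len(pending))
}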

// splitCleanupRequests splits the given slice at the first request whose
// cleanup time is after the provided `now`. The provided requests need to
// be sorted by cleanup time.
func splitCleanupRequests(now time.Time, requests []cgroupCleanupRequest) ([]cgroupCleanupRequest, []cgroupCleanupRequest) {
	splitIdx := len(requests)
	// Requests have to be ordered by cleanup time.
	for i, r := range requests {
		if now.Before(r.cleanupAfter) {
			splitIdx = i
			break
		}
	}

	return requests[:splitIdx], requests[splitIdx:]
}

func (t *Tracer) queueCgroupForRemoval(cgroupID cgroup.ID) {
	t.cgroupCleanupMu.Lock()
	t.requestedCgroupCleanups = append(t.requestedCgroupCleanups, cgroupCleanupRequest{
		cgroupID:     cgroupID,
		cleanupAfter: time.Now().Add(t.cgroupCleanupDelay),
	})
	t.cgroupCleanupMu.Unlock()
}

func (t *Tracer) removeCgroups(cgroupIDs []cgroup.ID) {
	t.policyMu.Lock()
	t.removedCgroupsMu.Lock()
	for _, id := range cgroupIDs {
		delete(t.cgroupEventPolicy, id)
		t.removedCgroups[id] = struct{}{}
	}
	t.policyMu.Unlock()
	t.removedCgroupsMu.Unlock()

	for _, id := range cgroupIDs {
		t.cfg.ContainerClient.CleanupCgroup(id)
		t.cfg.CgroupClient.CleanupCgroup(id)
	}
}
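
// onCgroupRemovedExample is a hypothetical sketch of the teardown flow:
// when a container's cgroup goes away, it is queued for delayed removal so
// late in-flight events can still resolve policy state; the
// cgroupCleanupLoop started by Run later calls removeCgroups for it.
func (t *Tracer) onCgroupRemovedExample(id cgroup.ID) {
	t.queueCgroupForRemoval(id)
}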