github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/runcfanotify/runcfanotify.go (about) 1 // Copyright 2021 The Inspektor Gadget authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package runcfanotify 16 17 import ( 18 "encoding/json" 19 "errors" 20 "fmt" 21 "io" 22 "math" 23 "os" 24 "path" 25 "path/filepath" 26 "strconv" 27 "strings" 28 "sync" 29 "sync/atomic" 30 "time" 31 32 ocispec "github.com/opencontainers/runtime-spec/specs-go" 33 "github.com/s3rj1k/go-fanotify/fanotify" 34 log "github.com/sirupsen/logrus" 35 "golang.org/x/sys/unix" 36 37 "github.com/inspektor-gadget/inspektor-gadget/pkg/utils/host" 38 ) 39 40 type EventType int 41 42 const ( 43 EventTypeAddContainer EventType = iota 44 EventTypeRemoveContainer 45 ) 46 47 // ContainerEvent is the notification for container creation or termination 48 type ContainerEvent struct { 49 // Type is whether the container was added or removed 50 Type EventType 51 52 // ContainerID is the container id, typically a 64 hexadecimal string 53 ContainerID string 54 55 // ContainerName is the container name given by the container runtime, 56 // typically two words with an underscore. Notice it might be different from 57 // the one given by Kubernetes. 58 ContainerName string 59 60 // ContainerPID is the process id of the container 61 ContainerPID uint32 62 63 // Container's configuration is the config.json from the OCI runtime 64 // spec 65 ContainerConfig *ocispec.Spec 66 67 // Bundle is the directory containing the config.json from the OCI 68 // runtime spec 69 // See https://github.com/opencontainers/runtime-spec/blob/main/bundle.md 70 Bundle string 71 } 72 73 type RuncNotifyFunc func(notif ContainerEvent) 74 75 type runcContainer struct { 76 id string 77 pid int 78 pidfd int 79 } 80 81 type futureContainer struct { 82 id string 83 name string 84 bundleDir string 85 pidFile string 86 } 87 88 type RuncNotifier struct { 89 runcBinaryNotify *fanotify.NotifyFD 90 callback RuncNotifyFunc 91 92 // containers is the set of containers that are being watched for 93 // termination. This prevents duplicate calls to 94 // AddWatchContainerTermination. 95 // 96 // Keys: Container ID 97 containers map[string]*runcContainer 98 containersMu sync.Mutex 99 100 // futureContainers is the set of containers that are detected before 101 // oci-runtime (runc/crun) creates the container e.g. detected via conmon 102 // 103 // Keys: Container ID 104 futureContainers map[string]*futureContainer 105 futureMu sync.Mutex 106 107 // set to true when RuncNotifier is closed 108 closed atomic.Bool 109 // this channel is used in watchContainersTermination() to avoid having to wait for the 110 // ticker to trigger before returning 111 done chan bool 112 113 wg sync.WaitGroup 114 } 115 116 // runcPaths is the list of paths where runc could be installed. Depending on 117 // the Linux distribution, it could be in different locations. 118 // 119 // When this package is executed in a container, it prepends the 120 // HOST_ROOT env variable to the path. 121 var runcPaths = []string{ 122 "/bin/runc", 123 "/usr/bin/runc", 124 "/usr/sbin/runc", 125 "/usr/local/bin/runc", 126 "/usr/local/sbin/runc", 127 "/usr/lib/cri-o-runc/sbin/runc", 128 "/run/torcx/unpack/docker/bin/runc", 129 "/usr/bin/crun", 130 } 131 132 // initFanotify initializes the fanotify API with the flags we need 133 func initFanotify() (*fanotify.NotifyFD, error) { 134 fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS | unix.FAN_NONBLOCK) 135 openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC 136 return fanotify.Initialize(fanotifyFlags, openFlags) 137 } 138 139 // Supported detects if RuncNotifier is supported in the current environment 140 func Supported() bool { 141 hostPidNs, err := host.IsHostPidNs() 142 if err != nil { 143 log.Debugf("Runcfanotify: not supported: %s", err) 144 return false 145 } 146 if !hostPidNs { 147 log.Debugf("Runcfanotify: not supported: not in host pid namespace") 148 return false 149 } 150 notifier, err := NewRuncNotifier(func(notif ContainerEvent) {}) 151 if notifier != nil { 152 notifier.Close() 153 } 154 if err != nil { 155 log.Warnf("checking if current pid namespace is host pid namespace %s", err) 156 } 157 return err == nil 158 } 159 160 // NewRuncNotifier uses fanotify to detect when runc containers are created 161 // or terminated, and call the callback on such event. 162 // 163 // Limitations: 164 // - runc must be installed in one of the paths listed by runcPaths 165 func NewRuncNotifier(callback RuncNotifyFunc) (*RuncNotifier, error) { 166 n := &RuncNotifier{ 167 callback: callback, 168 containers: make(map[string]*runcContainer), 169 futureContainers: make(map[string]*futureContainer), 170 done: make(chan bool), 171 } 172 173 runcBinaryNotify, err := initFanotify() 174 if err != nil { 175 return nil, err 176 } 177 n.runcBinaryNotify = runcBinaryNotify 178 179 runcMonitored := false 180 181 runcPath := os.Getenv("RUNC_PATH") 182 if runcPath != "" { 183 log.Debugf("Runcfanotify: trying runc from RUNC_PATH env variable at %s", runcPath) 184 185 if _, err := os.Stat(runcPath); errors.Is(err, os.ErrNotExist) { 186 return nil, err 187 } 188 189 if err := runcBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runcPath); err != nil { 190 return nil, fmt.Errorf("fanotify marking of %s: %w", runcPath, err) 191 } 192 runcMonitored = true 193 } else { 194 for _, r := range runcPaths { 195 runcPath := filepath.Join(host.HostRoot, r) 196 197 log.Debugf("Runcfanotify: trying runc at %s", runcPath) 198 199 if _, err := os.Stat(runcPath); errors.Is(err, os.ErrNotExist) { 200 log.Debugf("Runcfanotify: runc at %s not found", runcPath) 201 continue 202 } 203 204 if err := runcBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runcPath); err != nil { 205 log.Warnf("Runcfanotify: failed to fanotify mark: %s", err) 206 continue 207 } 208 runcMonitored = true 209 } 210 } 211 212 if !runcMonitored { 213 runcBinaryNotify.File.Close() 214 return nil, fmt.Errorf("no runc instance can be monitored with fanotify. The following paths were tested: %s. You can use the RUNC_PATH env variable to specify a custom path. If you are successful doing so, please open a PR to add your custom path to runcPaths", strings.Join(runcPaths, ",")) 215 } 216 217 n.wg.Add(2) 218 go n.watchContainersTermination() 219 go n.watchRunc() 220 221 return n, nil 222 } 223 224 // AddWatchContainerTermination watches a container for termination and 225 // generates an event on the notifier. This is automatically called for new 226 // containers detected by RuncNotifier, but it can also be called for 227 // containers detected externally such as initial containers. 228 func (n *RuncNotifier) AddWatchContainerTermination(containerID string, containerPID int) error { 229 n.containersMu.Lock() 230 defer n.containersMu.Unlock() 231 232 if _, ok := n.containers[containerID]; ok { 233 // This container is already being watched for termination 234 return nil 235 } 236 237 n.containers[containerID] = &runcContainer{ 238 id: containerID, 239 pid: containerPID, 240 } 241 242 return nil 243 } 244 245 // watchContainerTermination waits until the container terminates 246 func (n *RuncNotifier) watchContainersTermination() { 247 defer n.wg.Done() 248 249 ticker := time.NewTicker(time.Second) 250 defer ticker.Stop() 251 252 for { 253 select { 254 case <-n.done: 255 return 256 case <-ticker.C: 257 if n.closed.Load() { 258 return 259 } 260 261 dirEntries, err := os.ReadDir(host.HostProcFs) 262 if err != nil { 263 log.Errorf("reading /proc: %s", err) 264 return 265 } 266 pids := make(map[int]bool) 267 for _, entry := range dirEntries { 268 pid, err := strconv.Atoi(entry.Name()) 269 if err != nil { 270 // entry is not a process directory. Ignore. 271 continue 272 } 273 pids[pid] = true 274 } 275 276 n.containersMu.Lock() 277 for _, c := range n.containers { 278 if pids[c.pid] { 279 // container still running 280 continue 281 } 282 283 go n.callback(ContainerEvent{ 284 Type: EventTypeRemoveContainer, 285 ContainerID: c.id, 286 ContainerPID: uint32(c.pid), 287 }) 288 289 delete(n.containers, c.id) 290 } 291 n.containersMu.Unlock() 292 } 293 } 294 } 295 296 func (n *RuncNotifier) watchPidFileIterate(pidFileDirNotify *fanotify.NotifyFD, bundleDir string, pidFile string, pidFileDir string) (bool, error) { 297 // Get the next event from fanotify. 298 // Even though the API allows to pass skipPIDs, we cannot use 299 // it here because ResponseAllow would not be called. 300 data, err := pidFileDirNotify.GetEvent() 301 if err != nil { 302 return false, fmt.Errorf("%w", err) 303 } 304 305 // data can be nil if the event received is from a process in skipPIDs. 306 // In that case, skip and get the next event. 307 if data == nil { 308 return false, nil 309 } 310 311 // Don't leak the fd received by GetEvent 312 defer data.Close() 313 dataFile := data.File() 314 defer dataFile.Close() 315 316 if !data.MatchMask(unix.FAN_ACCESS_PERM) { 317 // This should not happen: FAN_ACCESS_PERM is the only mask Marked 318 return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid) 319 } 320 321 // This unblocks whoever is accessing the pidfile 322 defer pidFileDirNotify.ResponseAllow(data) 323 324 pid := data.GetPID() 325 326 // Skip events triggered by ourselves 327 if pid == os.Getpid() { 328 return false, nil 329 } 330 331 path, err := data.GetPath() 332 if err != nil { 333 return false, err 334 } 335 path = filepath.Join(host.HostRoot, path) 336 337 // Consider files identical if they have the same device/inode, 338 // even if the paths differ due to symlinks (for example, 339 // the event's path is /run/... but the runc --pid-file argument 340 // uses /var/run/..., where /var/run is a symlink to /run). 341 filesAreIdentical, err := checkFilesAreIdentical(path, pidFile) 342 if err != nil { 343 return false, err 344 } else if !filesAreIdentical { 345 return false, nil 346 } 347 348 pidFileContent, err := io.ReadAll(dataFile) 349 if err != nil { 350 return false, err 351 } 352 if len(pidFileContent) == 0 { 353 return false, fmt.Errorf("empty pid file") 354 } 355 containerPID, err := strconv.Atoi(string(pidFileContent)) 356 if err != nil { 357 return false, err 358 } 359 360 // Unfortunately, Linux 5.4 doesn't respect ignore masks 361 // See fix in Linux 5.9: 362 // https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e 363 // Workaround: remove parent mask. We don't need it anymore :) 364 err = pidFileDirNotify.Mark(unix.FAN_MARK_REMOVE, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir) 365 if err != nil { 366 return false, nil 367 } 368 369 bundleConfigJSON, err := os.ReadFile(filepath.Join(bundleDir, "config.json")) 370 if err != nil { 371 return false, err 372 } 373 containerConfig := &ocispec.Spec{} 374 err = json.Unmarshal(bundleConfigJSON, containerConfig) 375 if err != nil { 376 return false, err 377 } 378 379 // cri-o appends userdata to bundleDir, 380 // so we trim it here to get the correct containerID 381 containerID := filepath.Base(filepath.Clean(strings.TrimSuffix(bundleDir, "userdata"))) 382 383 err = n.AddWatchContainerTermination(containerID, containerPID) 384 if err != nil { 385 log.Errorf("runc fanotify: container %s with pid %d terminated before we could watch it: %s", containerID, containerPID, err) 386 return true, nil 387 } 388 389 if containerPID > math.MaxUint32 { 390 log.Errorf("Container PID (%d) exceeds math.MaxUint32 (%d)", containerPID, math.MaxUint32) 391 return true, nil 392 } 393 394 var containerName string 395 if fc := n.lookupFutureContainer(containerID); fc != nil { 396 containerName = fc.name 397 } 398 399 n.callback(ContainerEvent{ 400 Type: EventTypeAddContainer, 401 ContainerID: containerID, 402 ContainerPID: uint32(containerPID), 403 ContainerConfig: containerConfig, 404 Bundle: bundleDir, 405 ContainerName: containerName, 406 }) 407 408 return true, nil 409 } 410 411 func checkFilesAreIdentical(path1, path2 string) (bool, error) { 412 // Since fanotify masks don't work on Linux 5.4, we could get a 413 // notification for an unrelated file before the pid file is created 414 // See fix in Linux 5.9: 415 // https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e 416 // In this case we should not return an error. 417 if filepath.Base(path1) != filepath.Base(path2) { 418 return false, nil 419 } 420 421 f1, err := os.Stat(path1) 422 if err != nil { 423 return false, err 424 } 425 426 f2, err := os.Stat(path2) 427 if err != nil { 428 return false, err 429 } 430 431 return os.SameFile(f1, f2), nil 432 } 433 434 func (n *RuncNotifier) monitorRuncInstance(bundleDir string, pidFile string) error { 435 fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS) 436 openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC 437 438 pidFileDirNotify, err := fanotify.Initialize(fanotifyFlags, openFlags) 439 if err != nil { 440 return err 441 } 442 443 // The pidfile does not exist yet, so we cannot monitor it directly. 444 // Instead we monitor its parent directory with FAN_EVENT_ON_CHILD to 445 // get events on the directory's children. 446 pidFileDir := filepath.Dir(pidFile) 447 err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir) 448 if err != nil { 449 pidFileDirNotify.File.Close() 450 return fmt.Errorf("marking %s: %w", pidFileDir, err) 451 } 452 453 // watchPidFileIterate() will read config.json and it might be in the 454 // same directory as the pid file. To avoid getting events unrelated to 455 // the pidfile, add an ignore mask. 456 // 457 // This is best effort because the ignore mask is unfortunately not 458 // respected until a fix in Linux 5.9: 459 // https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e 460 configJSONPath := filepath.Join(bundleDir, "config.json") 461 err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, configJSONPath) 462 if err != nil { 463 pidFileDirNotify.File.Close() 464 return fmt.Errorf("ignoring %s: %w", configJSONPath, err) 465 } 466 467 // similar to config.json, we ignore passwd file if it exists 468 passwdPath := filepath.Join(bundleDir, "passwd") 469 if _, err := os.Stat(passwdPath); !errors.Is(err, os.ErrNotExist) { 470 err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, passwdPath) 471 if err != nil { 472 pidFileDirNotify.File.Close() 473 return fmt.Errorf("marking passwd path: %w", err) 474 } 475 } 476 477 n.wg.Add(1) 478 go func() { 479 defer n.wg.Done() 480 defer pidFileDirNotify.File.Close() 481 for { 482 stop, err := n.watchPidFileIterate(pidFileDirNotify, bundleDir, pidFile, pidFileDir) 483 if n.closed.Load() { 484 return 485 } 486 if err != nil { 487 log.Warnf("error watching pid: %v\n", err) 488 return 489 } 490 if stop { 491 return 492 } 493 } 494 }() 495 496 return nil 497 } 498 499 func (n *RuncNotifier) watchRunc() { 500 defer n.wg.Done() 501 502 for { 503 stop, err := n.watchRuncIterate() 504 if n.closed.Load() { 505 n.runcBinaryNotify.File.Close() 506 return 507 } 508 if err != nil { 509 log.Errorf("error watching runc: %v\n", err) 510 } 511 if stop { 512 n.runcBinaryNotify.File.Close() 513 return 514 } 515 } 516 } 517 518 func (n *RuncNotifier) parseConmonCmdline(cmdlineArr []string) { 519 if path.Base(cmdlineArr[0]) != "conmon" { 520 return 521 } 522 523 // Parse conmon command line 524 containerName := "" 525 containerID := "" 526 bundleDir := "" 527 pidFile := "" 528 conmonFound := false 529 530 conmonFound = true 531 for i := 0; i < len(cmdlineArr); i++ { 532 verb := cmdlineArr[i] 533 arg := "" 534 if i+1 < len(cmdlineArr) { 535 arg = cmdlineArr[i+1] 536 } 537 switch verb { 538 case "-n", "--name": 539 containerName = arg 540 i++ 541 case "-c", "--cid": 542 containerID = arg 543 i++ 544 case "-b", "--bundle": 545 bundleDir = arg 546 i++ 547 case "-p", "--container-pidfile": 548 pidFile = arg 549 i++ 550 } 551 } 552 553 if !conmonFound || containerName == "" || containerID == "" || bundleDir == "" || pidFile == "" { 554 return 555 } 556 557 n.futureMu.Lock() 558 n.futureContainers[containerID] = &futureContainer{ 559 id: containerID, 560 pidFile: pidFile, 561 bundleDir: bundleDir, 562 name: containerName, 563 } 564 n.futureMu.Unlock() 565 } 566 567 func (n *RuncNotifier) parseOCIRuntime(comm string, cmdlineArr []string) { 568 // Parse oci-runtime (runc/crun) command line 569 createFound := false 570 startFound := false 571 containerID := "" 572 bundleDir := "" 573 pidFile := "" 574 575 for i := 0; i < len(cmdlineArr); i++ { 576 if cmdlineArr[i] == "create" { 577 createFound = true 578 continue 579 } 580 if cmdlineArr[i] == "start" { 581 startFound = true 582 continue 583 } 584 if cmdlineArr[i] == "--bundle" && i+1 < len(cmdlineArr) { 585 i++ 586 bundleDir = filepath.Join(host.HostRoot, cmdlineArr[i]) 587 continue 588 } 589 if cmdlineArr[i] == "--pid-file" && i+1 < len(cmdlineArr) { 590 i++ 591 pidFile = filepath.Join(host.HostRoot, cmdlineArr[i]) 592 continue 593 } 594 if cmdlineArr[i] != "" { 595 containerID = cmdlineArr[i] 596 } 597 } 598 599 if comm == "runc" && createFound && bundleDir != "" && pidFile != "" { 600 err := n.monitorRuncInstance(bundleDir, pidFile) 601 if err != nil { 602 log.Errorf("error monitoring runc instance: %v\n", err) 603 } 604 } 605 606 if comm == "crun" && startFound && containerID != "" { 607 fc := n.lookupFutureContainer(containerID) 608 if fc == nil { 609 log.Warnf("cannot lookup container for %s\n", containerID) 610 return 611 } 612 bundleConfigJSON, err := os.ReadFile(filepath.Join(fc.bundleDir, "config.json")) 613 if err != nil { 614 log.Errorf("error reading bundle config: %v\n", err) 615 return 616 } 617 containerConfig := &ocispec.Spec{} 618 err = json.Unmarshal(bundleConfigJSON, containerConfig) 619 if err != nil { 620 log.Errorf("error unmarshaling bundle config: %v\n", err) 621 return 622 } 623 624 pidFileContent, err := os.ReadFile(fc.pidFile) 625 if err != nil { 626 log.Errorf("error reading pid file: %v\n", err) 627 return 628 } 629 if len(pidFileContent) == 0 { 630 log.Errorf("empty pid file") 631 return 632 } 633 containerPID, err := strconv.ParseUint(string(pidFileContent), 10, 32) 634 if err != nil { 635 log.Errorf("error parsing pid file: %v\n", err) 636 return 637 } 638 639 n.callback(ContainerEvent{ 640 Type: EventTypeAddContainer, 641 ContainerID: containerID, 642 ContainerPID: uint32(containerPID), 643 ContainerConfig: containerConfig, 644 Bundle: bundleDir, 645 ContainerName: fc.name, 646 }) 647 } 648 } 649 650 func (n *RuncNotifier) watchRuncIterate() (bool, error) { 651 // Get the next event from fanotify. 652 // Even though the API allows to pass skipPIDs, we cannot use it here 653 // because ResponseAllow would not be called. 654 data, err := n.runcBinaryNotify.GetEvent() 655 if err != nil { 656 return true, fmt.Errorf("%w", err) 657 } 658 659 // data can be nil if the event received is from a process in skipPIDs. 660 // In that case, skip and get the next event. 661 if data == nil { 662 return false, nil 663 } 664 665 // Don't leak the fd received by GetEvent 666 defer data.Close() 667 668 if !data.MatchMask(unix.FAN_OPEN_EXEC_PERM) { 669 // This should not happen: FAN_OPEN_EXEC_PERM is the only mask Marked 670 return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid) 671 } 672 673 // This unblocks the execution 674 defer n.runcBinaryNotify.ResponseAllow(data) 675 676 pid := data.GetPID() 677 678 // Skip events triggered by ourselves 679 if pid == os.Getpid() { 680 return false, nil 681 } 682 683 // runc is executing itself with unix.Exec(), so fanotify receives two 684 // FAN_OPEN_EXEC_PERM events: 685 // 1. from containerd-shim (or similar) 686 // 2. from runc, by this re-execution. 687 // This filter skips the first one and handles the second one. 688 comm := host.GetProcComm(pid) 689 cmdlineArr := host.GetProcCmdline(pid) 690 691 if len(cmdlineArr) == 0 { 692 return false, nil 693 } 694 695 switch comm { 696 case "conmon": 697 // conmon is a special case because it is not a child of the container 698 // Also, the calling sequence is podman -> conmon -> runc 699 n.parseConmonCmdline(cmdlineArr) 700 case "runc", "crun": 701 n.parseOCIRuntime(comm, cmdlineArr) 702 default: 703 return false, nil 704 } 705 706 return false, nil 707 } 708 709 func (n *RuncNotifier) Close() { 710 n.closed.Store(true) 711 close(n.done) 712 n.runcBinaryNotify.File.Close() 713 n.wg.Wait() 714 } 715 716 func (n *RuncNotifier) lookupFutureContainer(id string) *futureContainer { 717 n.futureMu.Lock() 718 defer n.futureMu.Unlock() 719 fc, ok := n.futureContainers[id] 720 if !ok { 721 return nil 722 } 723 delete(n.futureContainers, id) 724 return fc 725 }