github.com/rootless-containers/rootlesskit/v2@v2.3.4/pkg/parent/parent.go (about) 1 package parent 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "net" 8 "net/http" 9 "os" 10 "os/exec" 11 "os/user" 12 "path/filepath" 13 "strconv" 14 "syscall" 15 16 "github.com/gofrs/flock" 17 "github.com/gorilla/mux" 18 "github.com/rootless-containers/rootlesskit/v2/pkg/api/router" 19 "github.com/rootless-containers/rootlesskit/v2/pkg/messages" 20 "github.com/rootless-containers/rootlesskit/v2/pkg/network" 21 "github.com/rootless-containers/rootlesskit/v2/pkg/parent/cgrouputil" 22 "github.com/rootless-containers/rootlesskit/v2/pkg/parent/dynidtools" 23 "github.com/rootless-containers/rootlesskit/v2/pkg/parent/idtools" 24 "github.com/rootless-containers/rootlesskit/v2/pkg/port" 25 "github.com/rootless-containers/rootlesskit/v2/pkg/sigproxy" 26 "github.com/rootless-containers/rootlesskit/v2/pkg/sigproxy/signal" 27 "github.com/sirupsen/logrus" 28 "golang.org/x/sys/unix" 29 ) 30 31 type Opt struct { 32 PipeFDEnvKey string // needs to be set 33 ChildUseActivationEnvKey string // needs to be set 34 StateDir string // directory needs to be precreated 35 StateDirEnvKey string // optional env key to propagate StateDir value 36 NetworkDriver network.ParentDriver // nil for HostNetwork 37 PortDriver port.ParentDriver // nil for --port-driver=none 38 PublishPorts []port.Spec 39 CreatePIDNS bool 40 CreateCgroupNS bool 41 CreateUTSNS bool 42 CreateIPCNS bool 43 DetachNetNS bool 44 ParentEUIDEnvKey string // optional env key to propagate geteuid() value 45 ParentEGIDEnvKey string // optional env key to propagate getegid() value 46 Propagation string 47 EvacuateCgroup2 string // e.g. "rootlesskit_evacuation" 48 SubidSource SubidSource 49 } 50 51 type SubidSource string 52 53 const ( 54 SubidSourceAuto = SubidSource("auto") // Try dynamic then fallback to static 55 SubidSourceDynamic = SubidSource("dynamic") // /usr/bin/getsubids 56 SubidSourceStatic = SubidSource("static") // /etc/{subuid,subgid} 57 ) 58 59 // Documented state files. Undocumented ones are subject to change. 60 const ( 61 StateFileLock = "lock" 62 StateFileChildPID = "child_pid" // decimal pid number text 63 StateFileAPISock = "api.sock" // REST API Socket 64 StateFileNetNs = "netns" // rootlesskit network namespace 65 ) 66 67 func checkPreflight(opt Opt) error { 68 if opt.PipeFDEnvKey == "" { 69 return errors.New("pipe FD env key is not set") 70 } 71 if opt.StateDir == "" { 72 return errors.New("state dir is not set") 73 } 74 if !filepath.IsAbs(opt.StateDir) { 75 return errors.New("state dir must be absolute") 76 } 77 if stat, err := os.Stat(opt.StateDir); err != nil || !stat.IsDir() { 78 return fmt.Errorf("state dir is inaccessible: %w", err) 79 } 80 81 if os.Geteuid() == 0 { 82 logrus.Warn("Running RootlessKit as the root user is unsupported.") 83 } 84 85 warnSysctl() 86 87 // invalid propagation doesn't result in an error 88 warnPropagation(opt.Propagation) 89 return nil 90 } 91 92 // createCleanupLock uses LOCK_SH for preventing automatic cleanup of 93 // "/tmp/<Our State Dir>" caused by by systemd. 94 // 95 // This LOCK_SH lock is different from our lock file in the state dir. 96 // We could unify the lock file into LOCK_SH, but we are still keeping 97 // the lock file for a historical reason. 98 // 99 // See: 100 // - https://github.com/rootless-containers/rootlesskit/issues/185 101 // - https://github.com/rootless-containers/rootlesskit/pull/188 102 func createCleanupLock(sDir string) error { 103 //lock state dir when using /tmp/ path 104 stateDir, err := os.Open(sDir) 105 if err != nil { 106 return err 107 } 108 err = unix.Flock(int(stateDir.Fd()), unix.LOCK_SH) 109 if err != nil { 110 logrus.Warnf("Failed to lock the state dir %s", sDir) 111 } 112 return nil 113 } 114 115 // LockStateDir creates and locks "lock" file in the state dir. 116 func LockStateDir(stateDir string) (*flock.Flock, error) { 117 lockPath := filepath.Join(stateDir, StateFileLock) 118 lock := flock.New(lockPath) 119 locked, err := lock.TryLock() 120 if err != nil { 121 return nil, fmt.Errorf("failed to lock %s: %w", lockPath, err) 122 } 123 if !locked { 124 return nil, fmt.Errorf("failed to lock %s, another RootlessKit is running with the same state directory?", lockPath) 125 } 126 return lock, nil 127 } 128 129 func setupFilesAndEnv(readPipe *os.File, writePipe *os.File, opt Opt) ([]*os.File, []string) { 130 // 0 1 and 2 are used for stdin. stdout, and stderr 131 const listenFdsStart = 3 132 listenPid, listenPidErr := strconv.Atoi(os.Getenv("LISTEN_PID")) 133 listenFds, listenFdsErr := strconv.Atoi(os.Getenv("LISTEN_FDS")) 134 useSystemdSocketFDs := listenPidErr == nil && listenFdsErr == nil && listenFds > 0 135 if !useSystemdSocketFDs { 136 listenFds = 0 137 } 138 extraFiles := make([]*os.File, listenFds+2) 139 for i, fd := 0, listenFdsStart; i < listenFds; i, fd = i+1, fd+1 { 140 name := "LISTEN_FD_" + strconv.Itoa(fd) 141 extraFiles[i] = os.NewFile(uintptr(fd), name) 142 } 143 extraFiles[listenFds] = readPipe 144 extraFiles[listenFds+1] = writePipe 145 cmdEnv := os.Environ() 146 cmdEnv = append(cmdEnv, opt.PipeFDEnvKey+"="+strconv.Itoa(listenFdsStart+listenFds)+","+strconv.Itoa(listenFdsStart+listenFds+1)) 147 cmdEnv = append(cmdEnv, opt.ChildUseActivationEnvKey+"="+strconv.FormatBool(listenPid == os.Getpid())) 148 return extraFiles, cmdEnv 149 } 150 151 func Parent(opt Opt) error { 152 if err := checkPreflight(opt); err != nil { 153 return err 154 } 155 156 err := createCleanupLock(opt.StateDir) 157 if err != nil { 158 return err 159 } 160 161 lock, err := LockStateDir(opt.StateDir) 162 if err != nil { 163 return err 164 } 165 defer os.RemoveAll(opt.StateDir) 166 defer lock.Unlock() 167 168 pipeR, pipeW, err := os.Pipe() // parent-to-child 169 if err != nil { 170 return err 171 } 172 pipe2R, pipe2W, err := os.Pipe() // child-to-parent 173 if err != nil { 174 return err 175 } 176 cmd := exec.Command("/proc/self/exe", os.Args[1:]...) 177 cmd.SysProcAttr = &syscall.SysProcAttr{ 178 Pdeathsig: syscall.SIGKILL, 179 Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, 180 } 181 182 if opt.NetworkDriver != nil { 183 if !opt.DetachNetNS { 184 cmd.SysProcAttr.Unshareflags |= syscall.CLONE_NEWNET 185 } 186 } 187 188 if opt.CreatePIDNS { 189 // cannot be Unshareflags (panics) 190 cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWPID 191 } 192 if opt.CreateCgroupNS { 193 cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWCGROUP 194 } 195 if opt.CreateUTSNS { 196 cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWUTS 197 } 198 if opt.CreateIPCNS { 199 cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWIPC 200 } 201 cmd.Stdin = os.Stdin 202 cmd.Stdout = os.Stdout 203 cmd.Stderr = os.Stderr 204 cmd.ExtraFiles, cmd.Env = setupFilesAndEnv(pipeR, pipe2W, opt) 205 if opt.StateDirEnvKey != "" { 206 cmd.Env = append(cmd.Env, opt.StateDirEnvKey+"="+opt.StateDir) 207 } 208 if opt.ParentEUIDEnvKey != "" { 209 cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", opt.ParentEUIDEnvKey, os.Geteuid())) 210 } 211 if opt.ParentEGIDEnvKey != "" { 212 cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", opt.ParentEGIDEnvKey, os.Getegid())) 213 } 214 if err := cmd.Start(); err != nil { 215 warnOnChildStartFailure(err) 216 return fmt.Errorf("failed to start the child: %w", err) 217 } 218 219 msgParentHello := &messages.Message{ 220 U: messages.U{ 221 ParentHello: &messages.ParentHello{}, 222 }, 223 } 224 if err := messages.Send(pipeW, msgParentHello); err != nil { 225 return err 226 } 227 if _, err := messages.WaitFor(pipe2R, messages.Name(messages.ChildHello{})); err != nil { 228 return err 229 } 230 231 if err := setupUIDGIDMap(cmd.Process.Pid, opt.SubidSource); err != nil { 232 return fmt.Errorf("failed to setup UID/GID map: %w", err) 233 } 234 msgParentInitIdmapCompleted := &messages.Message{ 235 U: messages.U{ 236 ParentInitIdmapCompleted: &messages.ParentInitIdmapCompleted{}, 237 }, 238 } 239 if err := messages.Send(pipeW, msgParentInitIdmapCompleted); err != nil { 240 return err 241 } 242 if _, err := messages.WaitFor(pipe2R, messages.Name(messages.ChildInitUserNSCompleted{})); err != nil { 243 return err 244 } 245 246 sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid) 247 defer signal.StopCatch(sigc) 248 249 if opt.EvacuateCgroup2 != "" { 250 if err := cgrouputil.EvacuateCgroup2(opt.EvacuateCgroup2); err != nil { 251 return err 252 } 253 } 254 255 // configure Network driver 256 msgParentInitNetworkDriverCompleted := &messages.Message{ 257 U: messages.U{ 258 ParentInitNetworkDriverCompleted: &messages.ParentInitNetworkDriverCompleted{}, 259 }, 260 } 261 262 if opt.NetworkDriver != nil { 263 var netns string 264 if opt.DetachNetNS { 265 netns = filepath.Join("/proc", strconv.Itoa(cmd.Process.Pid), "root", filepath.Clean(opt.StateDir), "netns") 266 } 267 netMsg, cleanupNetwork, err := opt.NetworkDriver.ConfigureNetwork(cmd.Process.Pid, opt.StateDir, netns) 268 if cleanupNetwork != nil { 269 defer cleanupNetwork() 270 } 271 if err != nil { 272 return fmt.Errorf("failed to setup network %+v: %w", opt.NetworkDriver, err) 273 } 274 msgParentInitNetworkDriverCompleted.U.ParentInitNetworkDriverCompleted = netMsg 275 } 276 if err := messages.Send(pipeW, msgParentInitNetworkDriverCompleted); err != nil { 277 return err 278 } 279 280 // configure Port driver 281 msgParentInitPortDriverCompleted := &messages.Message{ 282 U: messages.U{ 283 ParentInitPortDriverCompleted: &messages.ParentInitPortDriverCompleted{}, 284 }, 285 } 286 portDriverInitComplete := make(chan struct{}) 287 portDriverQuit := make(chan struct{}) 288 portDriverErr := make(chan error) 289 if opt.PortDriver != nil { 290 msgParentInitPortDriverCompleted.U.ParentInitPortDriverCompleted.PortDriverOpaque = opt.PortDriver.OpaqueForChild() 291 cctx := &port.ChildContext{ 292 IP: net.ParseIP(msgParentInitNetworkDriverCompleted.U.ParentInitNetworkDriverCompleted.IP).To4(), 293 } 294 go func() { 295 portDriverErr <- opt.PortDriver.RunParentDriver(portDriverInitComplete, 296 portDriverQuit, cctx) 297 }() 298 } 299 if err := messages.Send(pipeW, msgParentInitPortDriverCompleted); err != nil { 300 return err 301 } 302 303 // Close the parent-to-child pipe 304 if err := pipeW.Close(); err != nil { 305 return err 306 } 307 if opt.PortDriver != nil { 308 // wait for port driver to be ready 309 select { 310 case <-portDriverInitComplete: 311 case err = <-portDriverErr: 312 return err 313 } 314 // publish ports 315 for _, p := range opt.PublishPorts { 316 st, err := opt.PortDriver.AddPort(context.TODO(), p) 317 if err != nil { 318 return fmt.Errorf("failed to expose port %v: %w", p, err) 319 } 320 logrus.Debugf("published port %v", st) 321 } 322 } 323 324 // after child is fully configured, write PID to child_pid file 325 childPIDPath := filepath.Join(opt.StateDir, StateFileChildPID) 326 if err := os.WriteFile(childPIDPath, []byte(strconv.Itoa(cmd.Process.Pid)), 0444); err != nil { 327 return fmt.Errorf("failed to write the child PID %d to %s: %w", cmd.Process.Pid, childPIDPath, err) 328 } 329 // listens the API 330 apiSockPath := filepath.Join(opt.StateDir, StateFileAPISock) 331 apiCloser, err := listenServeAPI(apiSockPath, &router.Backend{ 332 StateDir: opt.StateDir, 333 ChildPID: cmd.Process.Pid, 334 NetworkDriver: opt.NetworkDriver, 335 PortDriver: opt.PortDriver, 336 }) 337 if err != nil { 338 return err 339 } 340 // block until the child exits 341 if err := cmd.Wait(); err != nil { 342 return fmt.Errorf("child exited: %w", err) 343 } 344 // close the API socket 345 if err := apiCloser.Close(); err != nil { 346 return fmt.Errorf("failed to close %s: %w", apiSockPath, err) 347 } 348 // shut down port driver 349 if opt.PortDriver != nil { 350 portDriverQuit <- struct{}{} 351 err = <-portDriverErr 352 } 353 return err 354 } 355 356 func getSubIDRanges(u *user.User, subidSource SubidSource) ([]idtools.SubIDRange, []idtools.SubIDRange, error) { 357 uid, err := strconv.Atoi(u.Uid) 358 if err != nil { 359 return nil, nil, err 360 } 361 switch subidSource { 362 case SubidSourceStatic: 363 logrus.Debugf("subid-source: using the static source") 364 return idtools.GetSubIDRanges(uid, u.Username) 365 case SubidSourceDynamic: 366 logrus.Debugf("subid-source: using the dynamic source") 367 return dynidtools.GetSubIDRanges(uid, u.Username) 368 case "", SubidSourceAuto: 369 subuidRanges, subgidRanges, err := getSubIDRanges(u, SubidSourceDynamic) 370 if err == nil && len(subuidRanges) > 0 && len(subgidRanges) > 0 { 371 return subuidRanges, subgidRanges, nil 372 } 373 logrus.WithError(err).Debugf("failed to use subid source %q, falling back to %q", SubidSourceDynamic, SubidSourceStatic) 374 return getSubIDRanges(u, SubidSourceStatic) 375 default: 376 return nil, nil, fmt.Errorf("unknown subid source %q", subidSource) 377 } 378 } 379 380 func newugidmapArgs(subidSource SubidSource) ([]string, []string, error) { 381 u, err := user.Current() 382 if err != nil { 383 return nil, nil, err 384 } 385 subuidRanges, subgidRanges, err := getSubIDRanges(u, subidSource) 386 if err != nil { 387 return nil, nil, err 388 } 389 logrus.Debugf("subuid ranges=%v", subuidRanges) 390 logrus.Debugf("subgid ranges=%v", subgidRanges) 391 return newugidmapArgsFromSubIDRanges(u, subuidRanges, subgidRanges) 392 } 393 394 func newugidmapArgsFromSubIDRanges(u *user.User, subuidRanges, subgidRanges []idtools.SubIDRange) ([]string, []string, error) { 395 uidMap := []string{ 396 "0", 397 u.Uid, 398 "1", 399 } 400 gidMap := []string{ 401 "0", 402 u.Gid, 403 "1", 404 } 405 406 uidMapLast := 1 407 for _, f := range subuidRanges { 408 uidMap = append(uidMap, []string{ 409 strconv.Itoa(uidMapLast), 410 strconv.Itoa(f.Start), 411 strconv.Itoa(f.Length), 412 }...) 413 uidMapLast += f.Length 414 } 415 gidMapLast := 1 416 for _, f := range subgidRanges { 417 gidMap = append(gidMap, []string{ 418 strconv.Itoa(gidMapLast), 419 strconv.Itoa(f.Start), 420 strconv.Itoa(f.Length), 421 }...) 422 gidMapLast += f.Length 423 } 424 425 return uidMap, gidMap, nil 426 } 427 428 func setupUIDGIDMap(pid int, subidSource SubidSource) error { 429 uArgs, gArgs, err := newugidmapArgs(subidSource) 430 if err != nil { 431 return fmt.Errorf("failed to compute uid/gid map: %w", err) 432 } 433 pidS := strconv.Itoa(pid) 434 cmd := exec.Command("newuidmap", append([]string{pidS}, uArgs...)...) 435 out, err := cmd.CombinedOutput() 436 if err != nil { 437 return fmt.Errorf("newuidmap %s %v failed: %s: %w", pidS, uArgs, string(out), err) 438 } 439 cmd = exec.Command("newgidmap", append([]string{pidS}, gArgs...)...) 440 out, err = cmd.CombinedOutput() 441 if err != nil { 442 return fmt.Errorf("newgidmap %s %v failed: %s: %w", pidS, gArgs, string(out), err) 443 } 444 return nil 445 } 446 447 // apiCloser is implemented by *http.Server 448 type apiCloser interface { 449 Close() error 450 Shutdown(context.Context) error 451 } 452 453 func listenServeAPI(socketPath string, backend *router.Backend) (apiCloser, error) { 454 r := mux.NewRouter() 455 router.AddRoutes(r, backend) 456 srv := &http.Server{Handler: r} 457 err := os.RemoveAll(socketPath) 458 if err != nil { 459 return nil, err 460 } 461 l, err := net.Listen("unix", socketPath) 462 if err != nil { 463 return nil, err 464 } 465 go srv.Serve(l) 466 return srv, nil 467 } 468 469 // InitStateDir removes everything in the state dir except the lock file. 470 // This is needed because when the previous execution crashed, the state dir may not be removed successfully. 471 // 472 // InitStateDir must be called before calling parent functions. 473 func InitStateDir(stateDir string) error { 474 if err := os.MkdirAll(stateDir, 0755); err != nil { 475 return err 476 } 477 lk, err := LockStateDir(stateDir) 478 if err != nil { 479 return err 480 } 481 defer lk.Unlock() 482 stateDirStuffs, err := os.ReadDir(stateDir) 483 if err != nil { 484 return err 485 } 486 for _, f := range stateDirStuffs { 487 if f.Name() == StateFileLock { 488 continue 489 } 490 p := filepath.Join(stateDir, f.Name()) 491 if err := os.RemoveAll(p); err != nil { 492 return fmt.Errorf("failed to remove %s: %w", p, err) 493 } 494 } 495 return nil 496 }