github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/init/init.go (about) 1 // Copyright 2014 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package main 18 19 // this implements /init of stage1/nspawn+systemd 20 21 import ( 22 "errors" 23 "flag" 24 "fmt" 25 "io" 26 "io/ioutil" 27 "net" 28 "os" 29 "os/exec" 30 "path/filepath" 31 "runtime" 32 "strings" 33 "syscall" 34 35 "github.com/appc/goaci/proj2aci" 36 "github.com/appc/spec/schema/types" 37 "github.com/coreos/go-systemd/util" 38 "github.com/coreos/pkg/dlopen" 39 "github.com/godbus/dbus" 40 "github.com/godbus/dbus/introspect" 41 "github.com/hashicorp/errwrap" 42 43 stage1common "github.com/rkt/rkt/stage1/common" 44 stage1commontypes "github.com/rkt/rkt/stage1/common/types" 45 stage1initcommon "github.com/rkt/rkt/stage1/init/common" 46 47 "github.com/rkt/rkt/common" 48 "github.com/rkt/rkt/common/cgroup" 49 "github.com/rkt/rkt/common/cgroup/v1" 50 "github.com/rkt/rkt/common/cgroup/v2" 51 commonnet "github.com/rkt/rkt/common/networking" 52 "github.com/rkt/rkt/networking" 53 pkgflag "github.com/rkt/rkt/pkg/flag" 54 "github.com/rkt/rkt/pkg/fs" 55 rktlog "github.com/rkt/rkt/pkg/log" 56 "github.com/rkt/rkt/pkg/sys" 57 "github.com/rkt/rkt/pkg/user" 58 "github.com/rkt/rkt/stage1/init/kvm" 59 "github.com/rkt/rkt/stage1/init/kvm/hypervisor/hvlkvm" 60 "github.com/rkt/rkt/stage1/init/kvm/hypervisor/hvqemu" 61 ) 62 63 const ( 64 // Path to systemd-nspawn binary within the stage1 rootfs 65 nspawnBin = "/usr/bin/systemd-nspawn" 66 // Path to the localtime file/symlink in host 67 localtimePath = "/etc/localtime" 68 ) 69 70 // mirrorLocalZoneInfo tries to reproduce the /etc/localtime target in stage1/ to satisfy systemd-nspawn 71 func mirrorLocalZoneInfo(root string) { 72 zif, err := os.Readlink(localtimePath) 73 if err != nil { 74 return 75 } 76 77 // On some systems /etc/localtime is a relative symlink, make it absolute 78 if !filepath.IsAbs(zif) { 79 zif = filepath.Join(filepath.Dir(localtimePath), zif) 80 zif = filepath.Clean(zif) 81 } 82 83 src, err := os.Open(zif) 84 if err != nil { 85 return 86 } 87 defer src.Close() 88 89 destp := filepath.Join(common.Stage1RootfsPath(root), zif) 90 91 if err = os.MkdirAll(filepath.Dir(destp), 0755); err != nil { 92 return 93 } 94 95 dest, err := os.OpenFile(destp, os.O_CREATE|os.O_WRONLY, 0644) 96 if err != nil { 97 return 98 } 99 defer dest.Close() 100 101 _, _ = io.Copy(dest, src) 102 } 103 104 var ( 105 debug bool 106 localhostIP net.IP 107 localConfig string 108 log *rktlog.Logger 109 diag *rktlog.Logger 110 interpBin string // Path to the interpreter within the stage1 rootfs, set by the linker 111 ) 112 113 func parseFlags() *stage1commontypes.RuntimePod { 114 rp := stage1commontypes.RuntimePod{} 115 116 flag.BoolVar(&debug, "debug", false, "Run in debug mode") 117 flag.StringVar(&localConfig, "local-config", common.DefaultLocalConfigDir, "Local config path") 118 119 // These flags are persisted in the PodRuntime 120 flag.BoolVar(&rp.Interactive, "interactive", false, "The pod is interactive") 121 flag.BoolVar(&rp.Mutable, "mutable", false, "Enable mutable operations on this pod, including starting an empty one") 122 flag.Var(&rp.NetList, "net", "Setup networking") 123 flag.StringVar(&rp.PrivateUsers, "private-users", "", "Run within user namespace. Can be set to [=UIDBASE[:NUIDS]]") 124 flag.StringVar(&rp.MDSToken, "mds-token", "", "MDS auth token") 125 flag.StringVar(&rp.Hostname, "hostname", "", "Hostname of the pod") 126 flag.BoolVar(&rp.InsecureOptions.DisableCapabilities, "disable-capabilities-restriction", false, "Disable capability restrictions") 127 flag.BoolVar(&rp.InsecureOptions.DisablePaths, "disable-paths", false, "Disable paths restrictions") 128 flag.BoolVar(&rp.InsecureOptions.DisableSeccomp, "disable-seccomp", false, "Disable seccomp restrictions") 129 dnsConfMode := pkgflag.MustNewPairList(map[string][]string{ 130 "resolv": {"host", "stage0", "none", "default"}, 131 "hosts": {"host", "stage0", "default"}, 132 }, map[string]string{ 133 "resolv": "default", 134 "hosts": "default", 135 }) 136 flag.Var(dnsConfMode, "dns-conf-mode", "DNS config file modes") 137 flag.StringVar(&rp.IPCMode, "ipc", "", "IPC mode --ipc=[auto|private|parent]") 138 139 flag.Parse() 140 141 rp.Debug = debug 142 rp.ResolvConfMode = dnsConfMode.Pairs["resolv"] 143 rp.EtcHostsMode = dnsConfMode.Pairs["hosts"] 144 145 return &rp 146 } 147 148 func init() { 149 // this ensures that main runs only on main thread (thread group leader). 150 // since namespace ops (unshare, setns) are done for a single thread, we 151 // must ensure that the goroutine does not jump from OS thread to thread 152 runtime.LockOSThread() 153 154 // We'll need this later 155 localhostIP = net.ParseIP("127.0.0.1") 156 if localhostIP == nil { 157 panic("localhost IP failed to parse") 158 } 159 } 160 161 // machinedRegister checks if nspawn should register the pod to machined 162 func machinedRegister() bool { 163 // machined has a D-Bus interface following versioning guidelines, see: 164 // http://www.freedesktop.org/wiki/Software/systemd/machined/ 165 // Therefore we can just check if the D-Bus method we need exists and we 166 // don't need to check the signature. 167 var found int 168 169 conn, err := dbus.SystemBus() 170 if err != nil { 171 return false 172 } 173 node, err := introspect.Call(conn.Object("org.freedesktop.machine1", "/org/freedesktop/machine1")) 174 if err != nil { 175 return false 176 } 177 for _, iface := range node.Interfaces { 178 if iface.Name != "org.freedesktop.machine1.Manager" { 179 continue 180 } 181 // machined v215 supports methods "RegisterMachine" and "CreateMachine" called by nspawn v215. 182 // machined v216+ (since commit 5aa4bb) additionally supports methods "CreateMachineWithNetwork" 183 // and "RegisterMachineWithNetwork", called by nspawn v216+. 184 for _, method := range iface.Methods { 185 if method.Name == "CreateMachineWithNetwork" || method.Name == "RegisterMachineWithNetwork" { 186 found++ 187 } 188 } 189 break 190 } 191 return found == 2 192 } 193 194 func installAssets(systemdVersion int) error { 195 systemctlBin, err := common.LookupPath("systemctl", os.Getenv("PATH")) 196 if err != nil { 197 return err 198 } 199 systemdSysusersBin, err := common.LookupPath("systemd-sysusers", os.Getenv("PATH")) 200 if err != nil { 201 return err 202 } 203 systemdTmpfilesBin, err := common.LookupPath("systemd-tmpfiles", os.Getenv("PATH")) 204 if err != nil { 205 return err 206 } 207 bashBin, err := common.LookupPath("bash", os.Getenv("PATH")) 208 if err != nil { 209 return err 210 } 211 mountBin, err := common.LookupPath("mount", os.Getenv("PATH")) 212 if err != nil { 213 return err 214 } 215 umountBin, err := common.LookupPath("umount", os.Getenv("PATH")) 216 if err != nil { 217 return err 218 } 219 // More paths could be added in that list if some Linux distributions install it in a different path 220 // Note that we look in /usr/lib/... first because of the merge: 221 // http://www.freedesktop.org/wiki/Software/systemd/TheCaseForTheUsrMerge/ 222 systemdShutdownBin, err := common.LookupPath("systemd-shutdown", "/usr/lib/systemd:/lib/systemd") 223 if err != nil { 224 return err 225 } 226 systemdBin, err := common.LookupPath("systemd", "/usr/lib/systemd:/lib/systemd") 227 if err != nil { 228 return err 229 } 230 systemdJournaldBin, err := common.LookupPath("systemd-journald", "/usr/lib/systemd:/lib/systemd") 231 if err != nil { 232 return err 233 } 234 235 systemdUnitsPath := "/lib/systemd/system" 236 assets := []string{ 237 proj2aci.GetAssetString("/usr/lib/systemd/systemd", systemdBin), 238 proj2aci.GetAssetString("/usr/bin/systemctl", systemctlBin), 239 proj2aci.GetAssetString("/usr/bin/systemd-sysusers", systemdSysusersBin), 240 proj2aci.GetAssetString("/usr/bin/systemd-tmpfiles", systemdTmpfilesBin), 241 proj2aci.GetAssetString("/usr/lib/systemd/systemd-journald", systemdJournaldBin), 242 proj2aci.GetAssetString("/usr/bin/bash", bashBin), 243 proj2aci.GetAssetString("/bin/mount", mountBin), 244 proj2aci.GetAssetString("/bin/umount", umountBin), 245 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath)), 246 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath)), 247 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath)), 248 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath)), 249 // systemd-shutdown has to be installed at the same path as on the host 250 // because it depends on systemd build flag -DSYSTEMD_SHUTDOWN_BINARY_PATH= 251 proj2aci.GetAssetString(systemdShutdownBin, systemdShutdownBin), 252 } 253 254 // systemd-journal-flush.service was added in systemd-v233. Required to place 255 // the logs in /var/log/journal instead of /run/log/journal. See: 256 // https://github.com/systemd/systemd/commit/f78273c8dacf678cc8fd7387f678e6344a99405c 257 if systemdVersion >= 233 { 258 assets = append(assets, proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journal-flush.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath))) 259 } 260 261 return proj2aci.PrepareAssets(assets, "./stage1/rootfs/", nil) 262 } 263 264 // getArgsEnv returns the nspawn or lkvm args and env according to the flavor 265 // as the first two return values respectively. 266 func getArgsEnv(p *stage1commontypes.Pod, flavor string, canMachinedRegister bool, debug bool, n *networking.Networking, parentIPC bool) ([]string, []string, error) { 267 var args []string 268 env := os.Environ() 269 270 // We store the pod's flavor so we can later garbage collect it correctly 271 if err := os.Symlink(flavor, filepath.Join(p.Root, stage1initcommon.FlavorFile)); err != nil { 272 return nil, nil, errwrap.Wrap(errors.New("failed to create flavor symlink"), err) 273 } 274 275 // systemd-nspawn needs /etc/machine-id to link the container's journal 276 // to the host. Since systemd-v230, /etc/machine-id is mandatory, see 277 // https://github.com/systemd/systemd/commit/e01ff70a77e781734e1e73a2238af2e9bf7967a8 278 mPath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "machine-id") 279 machineID := strings.Replace(p.UUID.String(), "-", "", -1) 280 281 switch flavor { 282 case "kvm": 283 if p.PrivateUsers != "" { 284 return nil, nil, fmt.Errorf("flag --private-users cannot be used with an lkvm stage1") 285 } 286 287 // kernel and hypervisor binaries are located relative to the working directory 288 // of init (/var/lib/rkt/..../uuid) 289 // TODO: move to path.go 290 kernelPath := filepath.Join(common.Stage1RootfsPath(p.Root), "kernel_image") 291 netDescriptions := kvm.GetNetworkDescriptions(n) 292 293 cpu, mem := kvm.GetAppsResources(p.Manifest.Apps) 294 295 // Parse hypervisor 296 hv, err := KvmCheckHypervisor(common.Stage1RootfsPath(p.Root)) 297 if err != nil { 298 return nil, nil, err 299 } 300 301 // Set start command for hypervisor 302 StartCmd := hvlkvm.StartCmd 303 switch hv { 304 case "lkvm": 305 StartCmd = hvlkvm.StartCmd 306 case "qemu": 307 StartCmd = hvqemu.StartCmd 308 default: 309 return nil, nil, fmt.Errorf("unrecognized hypervisor") 310 } 311 312 hvStartCmd := StartCmd( 313 common.Stage1RootfsPath(p.Root), 314 p.UUID.String(), 315 kernelPath, 316 netDescriptions, 317 cpu, 318 mem, 319 debug, 320 ) 321 322 if hvStartCmd == nil { 323 return nil, nil, fmt.Errorf("no hypervisor") 324 } 325 326 args = append(args, hvStartCmd...) 327 328 // lkvm requires $HOME to be defined, 329 // see https://github.com/rkt/rkt/issues/1393 330 if os.Getenv("HOME") == "" { 331 env = append(env, "HOME=/root") 332 } 333 334 if err := linkJournal(common.Stage1RootfsPath(p.Root), machineID); err != nil { 335 return nil, nil, errwrap.Wrap(errors.New("error linking pod's journal"), err) 336 } 337 338 // use only dynamic libraries provided in the image 339 // from systemd v231 there's a new internal libsystemd-shared-v231.so 340 // which is present in /usr/lib/systemd 341 env = append(env, "LD_LIBRARY_PATH="+filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib/systemd")) 342 343 return args, env, nil 344 345 case "coreos": 346 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin)) 347 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin)) 348 args = append(args, "--boot") // Launch systemd in the pod 349 args = append(args, "--notify-ready=yes") // From systemd v231 350 351 if context := os.Getenv(common.EnvSELinuxContext); context != "" { 352 args = append(args, fmt.Sprintf("-Z%s", context)) 353 } 354 355 if context := os.Getenv(common.EnvSELinuxMountContext); context != "" { 356 args = append(args, fmt.Sprintf("-L%s", context)) 357 } 358 359 if canMachinedRegister { 360 args = append(args, fmt.Sprintf("--register=true")) 361 } else { 362 args = append(args, fmt.Sprintf("--register=false")) 363 } 364 365 kubernetesLogDir, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-dir") 366 if ok { 367 args = append(args, fmt.Sprintf("--bind=%s:/rkt/kubernetes/log", kubernetesLogDir)) 368 } 369 370 // use only dynamic libraries provided in the image 371 // from systemd v231 there's a new internal libsystemd-shared-v231.so 372 // which is present in /usr/lib/systemd 373 env = append(env, "LD_LIBRARY_PATH="+ 374 filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib")+":"+ 375 filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib/systemd")) 376 377 case "src": 378 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin)) 379 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin)) 380 args = append(args, "--boot") // Launch systemd in the pod 381 args = append(args, "--notify-ready=yes") // From systemd v231 382 383 if context := os.Getenv(common.EnvSELinuxContext); context != "" { 384 args = append(args, fmt.Sprintf("-Z%s", context)) 385 } 386 387 if context := os.Getenv(common.EnvSELinuxMountContext); context != "" { 388 args = append(args, fmt.Sprintf("-L%s", context)) 389 } 390 391 if canMachinedRegister { 392 args = append(args, fmt.Sprintf("--register=true")) 393 } else { 394 args = append(args, fmt.Sprintf("--register=false")) 395 } 396 397 kubernetesLogDir, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-dir") 398 if ok { 399 args = append(args, fmt.Sprintf("--bind=%s:/rkt/kubernetes/log", kubernetesLogDir)) 400 } 401 402 // use only dynamic libraries provided in the image 403 // from systemd v231 there's a new internal libsystemd-shared-v231.so 404 // which is present in /usr/lib/systemd 405 env = append(env, "LD_LIBRARY_PATH="+ 406 filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib")+":"+ 407 filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib/systemd")) 408 409 case "host": 410 hostNspawnBin, err := common.LookupPath("systemd-nspawn", os.Getenv("PATH")) 411 if err != nil { 412 return nil, nil, err 413 } 414 415 // Check dynamically which version is installed on the host 416 // Support version >= 220 417 versionBytes, err := exec.Command(hostNspawnBin, "--version").CombinedOutput() 418 if err != nil { 419 return nil, nil, errwrap.Wrap(fmt.Errorf("unable to probe %s version", hostNspawnBin), err) 420 } 421 versionStr := strings.SplitN(string(versionBytes), "\n", 2)[0] 422 var version int 423 n, err := fmt.Sscanf(versionStr, "systemd %d", &version) 424 if err != nil { 425 return nil, nil, fmt.Errorf("cannot parse version: %q", versionStr) 426 } 427 if n != 1 || version < 220 { 428 return nil, nil, fmt.Errorf("rkt needs systemd-nspawn >= 220. %s version not supported: %v", hostNspawnBin, versionStr) 429 } 430 431 // Copy systemd, bash, etc. in stage1 at run-time 432 if err := installAssets(version); err != nil { 433 return nil, nil, errwrap.Wrap(errors.New("cannot install assets from the host"), err) 434 } 435 436 args = append(args, hostNspawnBin) 437 args = append(args, "--boot") // Launch systemd in the pod 438 args = append(args, fmt.Sprintf("--register=true")) 439 440 if version >= 231 { 441 args = append(args, "--notify-ready=yes") // From systemd v231 442 } 443 444 if context := os.Getenv(common.EnvSELinuxContext); context != "" { 445 args = append(args, fmt.Sprintf("-Z%s", context)) 446 } 447 448 if context := os.Getenv(common.EnvSELinuxMountContext); context != "" { 449 args = append(args, fmt.Sprintf("-L%s", context)) 450 } 451 452 kubernetesLogDir, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-dir") 453 if ok { 454 args = append(args, fmt.Sprintf("--bind=%s:/rkt/kubernetes/log", kubernetesLogDir)) 455 } 456 457 default: 458 return nil, nil, fmt.Errorf("unrecognized stage1 flavor: %q", flavor) 459 } 460 461 machineIDBytes := append([]byte(machineID), '\n') 462 if err := ioutil.WriteFile(mPath, machineIDBytes, 0644); err != nil { 463 return nil, nil, errwrap.Wrap(errors.New("error writing /etc/machine-id"), err) 464 } 465 if err := user.ShiftFiles([]string{mPath}, &p.UidRange); err != nil { 466 return nil, nil, errwrap.Wrap(errors.New("error shifting /etc/machine-id"), err) 467 } 468 469 // link journal only if the host is running systemd 470 if util.IsRunningSystemd() { 471 args = append(args, "--link-journal=try-guest") 472 473 keepUnit, err := util.RunningFromSystemService() 474 if err != nil { 475 if err == dlopen.ErrSoNotFound { 476 log.Print("warning: libsystemd not found even though systemd is running. Cgroup limits set by the environment (e.g. a systemd service) won't be enforced.") 477 } else { 478 return nil, nil, errwrap.Wrap(errors.New("error determining if we're running from a system service"), err) 479 } 480 } 481 482 if keepUnit { 483 args = append(args, "--keep-unit") 484 } 485 } else { 486 args = append(args, "--link-journal=no") 487 } 488 489 if !debug { 490 args = append(args, "--quiet") // silence most nspawn output (log_warning is currently not covered by this) 491 env = append(env, "SYSTEMD_LOG_LEVEL=err") // silence log_warning too 492 } 493 494 if parentIPC { 495 env = append(env, "SYSTEMD_NSPAWN_SHARE_NS_IPC=true") 496 } 497 498 env = append(env, "SYSTEMD_NSPAWN_CONTAINER_SERVICE=rkt") 499 // TODO (alepuccetti) remove this line when rkt will use cgroup namespace 500 // If the kernel has the cgroup namespace enabled, systemd v232 will use it by default. 501 // This was introduced by https://github.com/systemd/systemd/pull/3809 and it will cause 502 // problems in rkt when cgns is enabled and cgroup-v1 is used. For more information see 503 // https://github.com/systemd/systemd/pull/3589#discussion_r70277625. 504 // The following line tells systemd-nspawn not to use cgroup namespace using the environment variable 505 // introduced by https://github.com/systemd/systemd/pull/3809. 506 env = append(env, "SYSTEMD_NSPAWN_USE_CGNS=no") 507 508 if p.InsecureOptions.DisablePaths { 509 env = append(env, "SYSTEMD_NSPAWN_API_VFS_WRITABLE=yes") 510 } 511 512 if len(p.PrivateUsers) > 0 { 513 args = append(args, "--private-users="+p.PrivateUsers) 514 } 515 516 nsargs, err := stage1initcommon.PodToNspawnArgs(p) 517 if err != nil { 518 return nil, nil, errwrap.Wrap(errors.New("failed to generate nspawn args"), err) 519 } 520 args = append(args, nsargs...) 521 522 // Arguments to systemd 523 args = append(args, "--") 524 args = append(args, "--default-standard-output=tty") // redirect all service logs straight to tty 525 if !debug { 526 args = append(args, "--log-target=null") // silence systemd output inside pod 527 args = append(args, "--show-status=0") // silence systemd initialization status output 528 } 529 530 return args, env, nil 531 } 532 533 func stage1(rp *stage1commontypes.RuntimePod) int { 534 uuid, err := types.NewUUID(flag.Arg(0)) 535 if err != nil { 536 log.FatalE("UUID is missing or malformed", err) 537 } 538 539 root := "." 540 p, err := stage1commontypes.LoadPod(root, uuid, rp) 541 if err != nil { 542 log.FatalE("failed to load pod", err) 543 } 544 545 if err := p.SaveRuntime(); err != nil { 546 log.FatalE("failed to save runtime parameters", err) 547 } 548 549 // set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking 550 // network plugins 551 lfd, err := common.GetRktLockFD() 552 if err != nil { 553 log.FatalE("failed to get rkt lock fd", err) 554 } 555 556 if err := sys.CloseOnExec(lfd, true); err != nil { 557 log.FatalE("failed to set FD_CLOEXEC on rkt lock", err) 558 } 559 560 mirrorLocalZoneInfo(p.Root) 561 562 flavor, _, err := stage1initcommon.GetFlavor(p) 563 if err != nil { 564 log.FatalE("failed to get stage1 flavor", err) 565 } 566 567 var n *networking.Networking 568 if p.NetList.Contained() { 569 fps, err := commonnet.ForwardedPorts(p.Manifest) 570 if err != nil { 571 log.FatalE("error initializing forwarding ports", err) 572 } 573 574 noDNS := p.ResolvConfMode != "default" // force ignore CNI DNS results 575 n, err = networking.Setup(root, p.UUID, fps, p.NetList, localConfig, flavor, noDNS, debug) 576 if err != nil { 577 log.FatalE("failed to setup network", err) 578 } 579 580 if err = n.Save(); err != nil { 581 log.PrintE("failed to save networking state", err) 582 n.Teardown(flavor, debug) 583 return 254 584 } 585 586 if len(p.MDSToken) > 0 { 587 hostIP, err := n.GetForwardableNetHostIP() 588 if err != nil { 589 log.FatalE("failed to get default Host IP", err) 590 } 591 592 p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, p.MDSToken) 593 } 594 } else { 595 if flavor == "kvm" { 596 log.Fatal("flavor kvm requires private network configuration (try --net)") 597 } 598 if len(p.MDSToken) > 0 { 599 p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, p.MDSToken) 600 } 601 } 602 603 mnt := fs.NewLoggingMounter( 604 fs.MounterFunc(syscall.Mount), 605 fs.UnmounterFunc(syscall.Unmount), 606 diag.Printf, 607 ) 608 609 // set hostname inside pod 610 // According to systemd manual (https://www.freedesktop.org/software/systemd/man/hostname.html) : 611 // "The /etc/hostname file configures the name of the local system that is set 612 // during boot using the sethostname system call" 613 if p.Hostname == "" { 614 p.Hostname = stage1initcommon.GetMachineID(p) 615 } 616 hostnamePath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc/hostname") 617 if err := ioutil.WriteFile(hostnamePath, []byte(p.Hostname), 0644); err != nil { 618 log.PrintE("error writing "+hostnamePath, err) 619 return 254 620 } 621 if err := user.ShiftFiles([]string{hostnamePath}, &p.UidRange); err != nil { 622 log.PrintE("error shifting "+hostnamePath, err) 623 } 624 625 if p.ResolvConfMode == "host" { 626 stage1initcommon.UseHostResolv(mnt, root) 627 } 628 629 // Set up the hosts file. 630 // We write <stage1>/etc/rkt-hosts if we want to override each app's hosts, 631 // and <stage1>/etc/hosts-fallback if we want to let the app "win" 632 // Either way, we should add our hostname to it, unless the hosts's 633 // /etc/hosts is bind-mounted in. 634 if p.EtcHostsMode == "host" { // We should bind-mount the hosts's /etc/hosts 635 stage1initcommon.UseHostHosts(mnt, root) 636 } else if p.EtcHostsMode == "default" { // Create hosts-fallback 637 hostsFile := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "hosts-fallback") 638 if err := stage1initcommon.AddHostsEntry(hostsFile, "127.0.0.1", p.Hostname); err != nil { 639 log.PrintE("Failed to write hostname to "+hostsFile, err) 640 return 254 641 } 642 } else if p.EtcHostsMode == "stage0" { // The stage0 has created rkt-hosts 643 hostsFile := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "rkt-hosts") 644 if err := stage1initcommon.AddHostsEntry(hostsFile, "127.0.0.1", p.Hostname); err != nil { 645 log.PrintE("Failed to write hostname to "+hostsFile, err) 646 return 254 647 } 648 } 649 650 if p.Mutable { 651 if err = stage1initcommon.MutableEnv(p); err != nil { 652 log.FatalE("cannot initialize mutable environment", err) 653 } 654 } else { 655 if err = stage1initcommon.ImmutableEnv(p); err != nil { 656 log.FatalE("cannot initialize immutable environment", err) 657 } 658 } 659 660 if err := stage1initcommon.SetJournalPermissions(p); err != nil { 661 log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err) 662 } 663 664 if flavor == "kvm" { 665 kvm.InitDebug(debug) 666 if err := KvmNetworkingToSystemd(p, n); err != nil { 667 log.FatalE("failed to configure systemd for kvm", err) 668 } 669 } 670 671 canMachinedRegister := false 672 if flavor != "kvm" { 673 // kvm doesn't register with systemd right now, see #2664. 674 canMachinedRegister = machinedRegister() 675 } 676 diag.Printf("canMachinedRegister %t", canMachinedRegister) 677 678 // --ipc=[auto|private|parent] 679 // default to private 680 parentIPC := false 681 switch p.IPCMode { 682 case "parent": 683 parentIPC = true 684 case "private": 685 parentIPC = false 686 case "auto": 687 fallthrough 688 case "": 689 parentIPC = false 690 default: 691 log.Fatalf("unknown value for --ipc parameter: %v", p.IPCMode) 692 } 693 if parentIPC && flavor == "kvm" { 694 log.Fatal("flavor kvm requires private IPC namespace (try to remove --ipc)") 695 } 696 697 args, env, err := getArgsEnv(p, flavor, canMachinedRegister, debug, n, parentIPC) 698 if err != nil { 699 log.FatalE("cannot get environment", err) 700 } 701 diag.Printf("args %q", args) 702 diag.Printf("env %q", env) 703 704 // create a separate mount namespace so the cgroup filesystems 705 // are unmounted when exiting the pod 706 if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { 707 log.FatalE("error unsharing", err) 708 } 709 710 // we recursively make / a "shared and slave" so mount events from the 711 // new namespace don't propagate to the host namespace but mount events 712 // from the host propagate to the new namespace and are forwarded to 713 // its peer group 714 // See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 715 if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil { 716 log.FatalE("error making / a slave mount", err) 717 } 718 if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil { 719 log.FatalE("error making / a shared and slave mount", err) 720 } 721 722 unifiedCgroup, err := cgroup.IsCgroupUnified("/") 723 if err != nil { 724 log.FatalE("error determining cgroup version", err) 725 } 726 diag.Printf("unifiedCgroup %t", unifiedCgroup) 727 728 machineID := stage1initcommon.GetMachineID(p) 729 730 subcgroup, err := getContainerSubCgroup(machineID, canMachinedRegister, unifiedCgroup) 731 if err != nil { 732 log.FatalE("error getting container subcgroup", err) 733 } 734 diag.Printf("subcgroup %q", subcgroup) 735 736 if err := ioutil.WriteFile(filepath.Join(p.Root, "subcgroup"), 737 []byte(fmt.Sprintf("%s", subcgroup)), 0644); err != nil { 738 log.FatalE("cannot write subcgroup file", err) 739 } 740 741 if !unifiedCgroup { 742 enabledCgroups, err := v1.GetEnabledCgroups() 743 if err != nil { 744 log.FatalE("error getting v1 cgroups", err) 745 } 746 diag.Printf("enabledCgroups %q", enabledCgroups) 747 748 if err := mountHostV1Cgroups(mnt, enabledCgroups); err != nil { 749 log.FatalE("couldn't mount the host v1 cgroups", err) 750 } 751 752 if !canMachinedRegister { 753 if err := v1.JoinSubcgroup("systemd", subcgroup); err != nil { 754 log.FatalE(fmt.Sprintf("error joining subcgroup %q", subcgroup), err) 755 } 756 } 757 758 var serviceNames []string 759 for _, app := range p.Manifest.Apps { 760 serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name)) 761 } 762 diag.Printf("serviceNames %q", serviceNames) 763 764 if err := mountContainerV1Cgroups(mnt, p, enabledCgroups, subcgroup, serviceNames); err != nil { 765 log.FatalE("couldn't mount the container v1 cgroups", err) 766 } 767 768 } 769 770 // KVM flavor has a bit different logic in handling pid vs ppid, for details look into #2389 771 // it doesn't require the existence of a "ppid", instead it registers the current pid (which 772 // will be reused by lkvm binary) as a pod process pid used during entering 773 pid_filename := "ppid" 774 if flavor == "kvm" { 775 pid_filename = "pid" 776 } 777 778 if err = stage1common.WritePid(os.Getpid(), pid_filename); err != nil { 779 log.FatalE("error writing pid", err) 780 } 781 782 if flavor == "kvm" { 783 if err := KvmPrepareMounts(p); err != nil { 784 log.FatalE("error preparing mounts", err) 785 } 786 } 787 788 err = stage1common.WithClearedCloExec(lfd, func() error { 789 return syscall.Exec(args[0], args, env) 790 }) 791 792 if err != nil { 793 log.FatalE(fmt.Sprintf("failed to execute %q", args[0]), err) 794 } 795 796 return 0 797 } 798 799 func areHostV1CgroupsMounted(enabledV1Cgroups map[int][]string) bool { 800 controllers := v1.GetControllerDirs(enabledV1Cgroups) 801 for _, c := range controllers { 802 if mounted, _ := v1.IsControllerMounted(c); !mounted { 803 return false 804 } 805 } 806 807 return true 808 } 809 810 // mountHostV1Cgroups mounts the host v1 cgroup hierarchy as required by 811 // systemd-nspawn. We need this because some distributions don't have the 812 // "name=systemd" cgroup or don't mount the cgroup controllers in 813 // "/sys/fs/cgroup", and systemd-nspawn needs this. Since this is mounted 814 // inside the rkt mount namespace, it doesn't affect the host. 815 func mountHostV1Cgroups(m fs.Mounter, enabledCgroups map[int][]string) error { 816 systemdControllerPath := "/sys/fs/cgroup/systemd" 817 if !areHostV1CgroupsMounted(enabledCgroups) { 818 mountContext := os.Getenv(common.EnvSELinuxMountContext) 819 if err := v1.CreateCgroups(m, "/", enabledCgroups, mountContext); err != nil { 820 return errwrap.Wrap(errors.New("error creating host cgroups"), err) 821 } 822 } 823 824 mounted, err := v1.IsControllerMounted("systemd") 825 if err != nil { 826 return err 827 } 828 if !mounted { 829 if err := os.MkdirAll(systemdControllerPath, 0700); err != nil { 830 return err 831 } 832 if err := m.Mount("cgroup", systemdControllerPath, "cgroup", 0, "none,name=systemd"); err != nil { 833 return errwrap.Wrap(fmt.Errorf("error mounting name=systemd hierarchy on %q", systemdControllerPath), err) 834 } 835 } 836 837 return nil 838 } 839 840 // mountContainerV1Cgroups mounts the cgroup controllers hierarchy in the container's 841 // namespace read-only, leaving the needed knobs in the subcgroup for each-app 842 // read-write so systemd inside stage1 can apply isolators to them 843 func mountContainerV1Cgroups(m fs.Mounter, p *stage1commontypes.Pod, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error { 844 mountContext := os.Getenv(common.EnvSELinuxMountContext) 845 stage1Root := common.Stage1RootfsPath(p.Root) 846 if err := v1.CreateCgroups(m, stage1Root, enabledCgroups, mountContext); err != nil { 847 return errwrap.Wrap(errors.New("error creating container cgroups"), err) 848 } 849 850 if err := v1.RemountCgroups(m, stage1Root, enabledCgroups, subcgroup, p.InsecureOptions.DisablePaths); err != nil { 851 return errwrap.Wrap(errors.New("error restricting container cgroups"), err) 852 } 853 854 return nil 855 } 856 857 func getContainerSubCgroup(machineID string, canMachinedRegister, unified bool) (string, error) { 858 var fromUnit bool 859 860 if util.IsRunningSystemd() { 861 var err error 862 if fromUnit, err = util.RunningFromSystemService(); err != nil { 863 return "", errwrap.Wrap(errors.New("could not determine if we're running from a unit file"), err) 864 } 865 } 866 867 if fromUnit { 868 slice, err := util.GetRunningSlice() 869 if err != nil { 870 return "", errwrap.Wrap(errors.New("could not get slice name"), err) 871 } 872 slicePath, err := common.SliceToPath(slice) 873 if err != nil { 874 return "", errwrap.Wrap(errors.New("could not convert slice name to path"), err) 875 } 876 unit, err := util.CurrentUnitName() 877 if err != nil { 878 return "", errwrap.Wrap(errors.New("could not get unit name"), err) 879 } 880 subcgroup := filepath.Join(slicePath, unit) 881 882 if unified { 883 return filepath.Join(subcgroup, "payload"), nil 884 } 885 886 return subcgroup, nil 887 } 888 889 escapedmID := strings.Replace(machineID, "-", "\\x2d", -1) 890 machineDir := "machine-" + escapedmID + ".scope" 891 892 if canMachinedRegister { 893 // we are not in the final cgroup yet: systemd-nspawn will move us 894 // to the correct cgroup later during registration so we can't 895 // look it up in /proc/self/cgroup 896 return filepath.Join("machine.slice", machineDir), nil 897 } 898 899 if unified { 900 subcgroup, err := v2.GetOwnCgroupPath() 901 if err != nil { 902 return "", errwrap.Wrap(errors.New("could not get own v2 cgroup path"), err) 903 } 904 return subcgroup, nil 905 } 906 907 // when registration is disabled the container will be directly 908 // under the current cgroup so we can look it up in /proc/self/cgroup 909 // Try the systemd slice first, falling back to cpu if that fails (e.g. on 910 // systems not running systemd). See issue #3502. 911 ownV1CgroupPath, err := v1.GetOwnCgroupPath("name=systemd") 912 if err != nil { 913 ownV1CgroupPath, err = v1.GetOwnCgroupPath("cpu") 914 if err != nil { 915 return "", errwrap.Wrap(errors.New("could not get own v1 cgroup path"), err) 916 } 917 } 918 919 // systemd-nspawn won't work if we are in the root cgroup. In addition, 920 // we want all rkt instances to be in distinct cgroups. Create a 921 // subcgroup and add ourselves to it. 922 return filepath.Join(ownV1CgroupPath, machineDir), nil 923 } 924 925 func main() { 926 rp := parseFlags() 927 stage1initcommon.InitDebug(debug) 928 929 log, diag, _ = rktlog.NewLogSet("stage1", debug) 930 if !debug { 931 diag.SetOutput(ioutil.Discard) 932 } 933 934 // move code into stage1() helper so deferred fns get run 935 os.Exit(stage1(rp)) 936 }