github.com/stackdocker/rkt@v0.10.1-0.20151109095037-1aa827478248/stage1/init/init.go (about) 1 // Copyright 2014 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package main 18 19 // #cgo LDFLAGS: -ldl 20 // #include <stdlib.h> 21 // #include <dlfcn.h> 22 // #include <sys/types.h> 23 // #include <unistd.h> 24 // 25 // int 26 // my_sd_pid_get_owner_uid(void *f, pid_t pid, uid_t *uid) 27 // { 28 // int (*sd_pid_get_owner_uid)(pid_t, uid_t *); 29 // 30 // sd_pid_get_owner_uid = (int (*)(pid_t, uid_t *))f; 31 // return sd_pid_get_owner_uid(pid, uid); 32 // } 33 // 34 // int 35 // my_sd_pid_get_unit(void *f, pid_t pid, char **unit) 36 // { 37 // int (*sd_pid_get_unit)(pid_t, char **); 38 // 39 // sd_pid_get_unit = (int (*)(pid_t, char **))f; 40 // return sd_pid_get_unit(pid, unit); 41 // } 42 // 43 // int 44 // my_sd_pid_get_slice(void *f, pid_t pid, char **slice) 45 // { 46 // int (*sd_pid_get_slice)(pid_t, char **); 47 // 48 // sd_pid_get_slice = (int (*)(pid_t, char **))f; 49 // return sd_pid_get_slice(pid, slice); 50 // } 51 // 52 // int 53 // am_session_leader() 54 // { 55 // return (getsid(0) == getpid()); 56 // } 57 import "C" 58 59 // this implements /init of stage1/nspawn+systemd 60 61 import ( 62 "flag" 63 "fmt" 64 "io" 65 "io/ioutil" 66 "log" 67 "net" 68 "os" 69 "os/exec" 70 "path/filepath" 71 "runtime" 72 "strconv" 73 "strings" 74 "syscall" 75 "unsafe" 76 77 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/goaci/proj2aci" 78 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/spec/schema/types" 79 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/coreos/go-systemd/util" 80 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/godbus/dbus" 81 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/godbus/dbus/introspect" 82 83 "github.com/coreos/rkt/common" 84 "github.com/coreos/rkt/common/cgroup" 85 "github.com/coreos/rkt/networking" 86 "github.com/coreos/rkt/pkg/sys" 87 "github.com/coreos/rkt/stage1/init/kvm" 88 ) 89 90 const ( 91 // Path to systemd-nspawn binary within the stage1 rootfs 92 nspawnBin = "/usr/bin/systemd-nspawn" 93 // Path to the interpreter within the stage1 rootfs 94 interpBin = "/usr/lib/ld-linux-x86-64.so.2" 95 // Path to the localtime file/symlink in host 96 localtimePath = "/etc/localtime" 97 ) 98 99 // mirrorLocalZoneInfo tries to reproduce the /etc/localtime target in stage1/ to satisfy systemd-nspawn 100 func mirrorLocalZoneInfo(root string) { 101 zif, err := os.Readlink(localtimePath) 102 if err != nil { 103 return 104 } 105 106 // On some systems /etc/localtime is a relative symlink, make it absolute 107 if !filepath.IsAbs(zif) { 108 zif = filepath.Join(filepath.Dir(localtimePath), zif) 109 zif = filepath.Clean(zif) 110 } 111 112 src, err := os.Open(zif) 113 if err != nil { 114 return 115 } 116 defer src.Close() 117 118 destp := filepath.Join(common.Stage1RootfsPath(root), zif) 119 120 if err = os.MkdirAll(filepath.Dir(destp), 0755); err != nil { 121 return 122 } 123 124 dest, err := os.OpenFile(destp, os.O_CREATE|os.O_WRONLY, 0644) 125 if err != nil { 126 return 127 } 128 defer dest.Close() 129 130 _, _ = io.Copy(dest, src) 131 } 132 133 var ( 134 debug bool 135 netList common.NetList 136 interactive bool 137 privateUsers string 138 mdsToken string 139 localhostIP net.IP 140 localConfig string 141 ) 142 143 func init() { 144 flag.BoolVar(&debug, "debug", false, "Run in debug mode") 145 flag.Var(&netList, "net", "Setup networking") 146 flag.BoolVar(&interactive, "interactive", false, "The pod is interactive") 147 flag.StringVar(&privateUsers, "private-users", "", "Run within user namespace. Can be set to [=UIDBASE[:NUIDS]]") 148 flag.StringVar(&mdsToken, "mds-token", "", "MDS auth token") 149 flag.StringVar(&localConfig, "local-config", common.DefaultLocalConfigDir, "Local config path") 150 // this ensures that main runs only on main thread (thread group leader). 151 // since namespace ops (unshare, setns) are done for a single thread, we 152 // must ensure that the goroutine does not jump from OS thread to thread 153 runtime.LockOSThread() 154 155 localhostIP = net.ParseIP("127.0.0.1") 156 if localhostIP == nil { 157 panic("localhost IP failed to parse") 158 } 159 } 160 161 // machinedRegister checks if nspawn should register the pod to machined 162 func machinedRegister() bool { 163 // machined has a D-Bus interface following versioning guidelines, see: 164 // http://www.freedesktop.org/wiki/Software/systemd/machined/ 165 // Therefore we can just check if the D-Bus method we need exists and we 166 // don't need to check the signature. 167 var found int 168 169 conn, err := dbus.SystemBus() 170 if err != nil { 171 return false 172 } 173 node, err := introspect.Call(conn.Object("org.freedesktop.machine1", "/org/freedesktop/machine1")) 174 if err != nil { 175 return false 176 } 177 for _, iface := range node.Interfaces { 178 if iface.Name != "org.freedesktop.machine1.Manager" { 179 continue 180 } 181 // machined v215 supports methods "RegisterMachine" and "CreateMachine" called by nspawn v215. 182 // machined v216+ (since commit 5aa4bb) additionally supports methods "CreateMachineWithNetwork" 183 // and "RegisterMachineWithNetwork", called by nspawn v216+. 184 for _, method := range iface.Methods { 185 if method.Name == "CreateMachineWithNetwork" || method.Name == "RegisterMachineWithNetwork" { 186 found++ 187 } 188 } 189 break 190 } 191 return found == 2 192 } 193 194 func lookupPath(bin string, paths string) (string, error) { 195 pathsArr := filepath.SplitList(paths) 196 for _, path := range pathsArr { 197 binPath := filepath.Join(path, bin) 198 binAbsPath, err := filepath.Abs(binPath) 199 if err != nil { 200 return "", fmt.Errorf("unable to find absolute path for %s", binPath) 201 } 202 d, err := os.Stat(binAbsPath) 203 if err != nil { 204 continue 205 } 206 // Check the executable bit, inspired by os.exec.LookPath() 207 if m := d.Mode(); !m.IsDir() && m&0111 != 0 { 208 return binAbsPath, nil 209 } 210 } 211 return "", fmt.Errorf("unable to find %q in %q", bin, paths) 212 } 213 214 func installAssets() error { 215 systemctlBin, err := lookupPath("systemctl", os.Getenv("PATH")) 216 if err != nil { 217 return err 218 } 219 bashBin, err := lookupPath("bash", os.Getenv("PATH")) 220 if err != nil { 221 return err 222 } 223 // More paths could be added in that list if some Linux distributions install it in a different path 224 // Note that we look in /usr/lib/... first because of the merge: 225 // http://www.freedesktop.org/wiki/Software/systemd/TheCaseForTheUsrMerge/ 226 systemdShutdownBin, err := lookupPath("systemd-shutdown", "/usr/lib/systemd:/lib/systemd") 227 if err != nil { 228 return err 229 } 230 systemdBin, err := lookupPath("systemd", "/usr/lib/systemd:/lib/systemd") 231 if err != nil { 232 return err 233 } 234 systemdJournaldBin, err := lookupPath("systemd-journald", "/usr/lib/systemd:/lib/systemd") 235 if err != nil { 236 return err 237 } 238 239 systemdUnitsPath := "/usr/lib/systemd/system" 240 assets := []string{ 241 proj2aci.GetAssetString("/usr/lib/systemd/systemd", systemdBin), 242 proj2aci.GetAssetString("/usr/bin/systemctl", systemctlBin), 243 proj2aci.GetAssetString("/usr/lib/systemd/systemd-journald", systemdJournaldBin), 244 proj2aci.GetAssetString("/usr/bin/bash", bashBin), 245 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath)), 246 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath)), 247 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath)), 248 proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath)), 249 // systemd-shutdown has to be installed at the same path as on the host 250 // because it depends on systemd build flag -DSYSTEMD_SHUTDOWN_BINARY_PATH= 251 proj2aci.GetAssetString(systemdShutdownBin, systemdShutdownBin), 252 } 253 254 return proj2aci.PrepareAssets(assets, "./stage1/rootfs/", nil) 255 } 256 257 // getArgsEnv returns the nspawn or lkvm args and env according to the flavor used 258 func getArgsEnv(p *Pod, flavor string, debug bool, n *networking.Networking) ([]string, []string, error) { 259 var args []string 260 env := os.Environ() 261 262 // We store the pod's flavor so we can later garbage collect it correctly 263 if err := os.Symlink(flavor, filepath.Join(p.Root, flavorFile)); err != nil { 264 return nil, nil, fmt.Errorf("failed to create flavor symlink: %v", err) 265 } 266 267 switch flavor { 268 case "kvm": 269 if privateUsers != "" { 270 return nil, nil, fmt.Errorf("flag --private-users cannot be used with an lkvm stage1") 271 } 272 273 // kernel and lkvm are relative path, because init has /var/lib/rkt/..../uuid as its working directory 274 // TODO: move to path.go 275 kernelPath := filepath.Join(common.Stage1RootfsPath(p.Root), "bzImage") 276 lkvmPath := filepath.Join(common.Stage1RootfsPath(p.Root), "lkvm") 277 netDescriptions := kvm.GetNetworkDescriptions(n) 278 lkvmNetArgs, kernelNetParams, err := kvm.GetKVMNetArgs(netDescriptions) 279 if err != nil { 280 return nil, nil, err 281 } 282 283 // TODO: base on resource isolators 284 cpu := 1 285 mem := 128 286 287 kernelParams := []string{ 288 "console=hvc0", 289 "init=/usr/lib/systemd/systemd", 290 "no_timer_check", 291 "noreplace-smp", 292 "systemd.default_standard_error=journal+console", 293 "systemd.default_standard_output=journal+console", 294 strings.Join(kernelNetParams, " "), 295 // "systemd.default_standard_output=tty", 296 "tsc=reliable", 297 "MACHINEID=" + p.UUID.String(), 298 } 299 300 if debug { 301 kernelParams = append(kernelParams, []string{ 302 "debug", 303 "systemd.log_level=debug", 304 "systemd.show_status=true", 305 // "systemd.confirm_spawn=true", 306 }...) 307 } else { 308 kernelParams = append(kernelParams, "quiet") 309 } 310 311 args = append(args, []string{ 312 "./" + lkvmPath, // relative path 313 "run", 314 "--name", "rkt-" + p.UUID.String(), 315 "--no-dhcp", // speed bootup 316 "--cpu", strconv.Itoa(cpu), 317 "--mem", strconv.Itoa(mem), 318 "--console=virtio", 319 "--kernel", kernelPath, 320 "--disk", "stage1/rootfs", // relative to run/pods/uuid dir this is a place where systemd resides 321 // MACHINEID will be available as environment variable 322 "--params", strings.Join(kernelParams, " "), 323 }..., 324 ) 325 args = append(args, lkvmNetArgs...) 326 327 if debug { 328 args = append(args, "--debug") 329 } 330 331 // host volume sharing with 9p 332 nsargs := kvm.VolumesToKvmDiskArgs(p.Manifest.Volumes) 333 args = append(args, nsargs...) 334 335 // lkvm requires $HOME to be defined, 336 // see https://github.com/coreos/rkt/issues/1393 337 if os.Getenv("HOME") == "" { 338 env = append(env, "HOME=/root") 339 } 340 341 return args, env, nil 342 343 case "coreos": 344 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin)) 345 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin)) 346 args = append(args, "--boot") // Launch systemd in the pod 347 348 if context := os.Getenv(common.EnvSELinuxContext); context != "" { 349 args = append(args, fmt.Sprintf("-Z%s", context)) 350 } 351 352 if machinedRegister() { 353 args = append(args, fmt.Sprintf("--register=true")) 354 } else { 355 args = append(args, fmt.Sprintf("--register=false")) 356 } 357 358 // use only dynamic libraries provided in the image 359 env = append(env, "LD_LIBRARY_PATH="+filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib")) 360 361 case "src": 362 args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin)) 363 args = append(args, "--boot") // Launch systemd in the pod 364 365 if context := os.Getenv(common.EnvSELinuxContext); context != "" { 366 args = append(args, fmt.Sprintf("-Z%s", context)) 367 } 368 369 if machinedRegister() { 370 args = append(args, fmt.Sprintf("--register=true")) 371 } else { 372 args = append(args, fmt.Sprintf("--register=false")) 373 } 374 375 case "host": 376 hostNspawnBin, err := lookupPath("systemd-nspawn", os.Getenv("PATH")) 377 if err != nil { 378 return nil, nil, err 379 } 380 381 // Check dynamically which version is installed on the host 382 // Support version >= 220 383 versionBytes, err := exec.Command(hostNspawnBin, "--version").CombinedOutput() 384 if err != nil { 385 return nil, nil, fmt.Errorf("unable to probe %s version: %v", hostNspawnBin, err) 386 } 387 versionStr := strings.SplitN(string(versionBytes), "\n", 2)[0] 388 var version int 389 n, err := fmt.Sscanf(versionStr, "systemd %d", &version) 390 if err != nil { 391 return nil, nil, fmt.Errorf("cannot parse version: %q", versionStr) 392 } 393 if n != 1 || version < 220 { 394 return nil, nil, fmt.Errorf("rkt needs systemd-nspawn >= 220. %s version not supported: %v", hostNspawnBin, versionStr) 395 } 396 397 // Copy systemd, bash, etc. in stage1 at run-time 398 if err := installAssets(); err != nil { 399 return nil, nil, fmt.Errorf("cannot install assets from the host: %v", err) 400 } 401 402 args = append(args, hostNspawnBin) 403 args = append(args, "--boot") // Launch systemd in the pod 404 args = append(args, fmt.Sprintf("--register=true")) 405 406 if context := os.Getenv(common.EnvSELinuxContext); context != "" { 407 args = append(args, fmt.Sprintf("-Z%s", context)) 408 } 409 410 default: 411 return nil, nil, fmt.Errorf("unrecognized stage1 flavor: %q", flavor) 412 } 413 414 // link journal only if the host is running systemd 415 if util.IsRunningSystemd() { 416 // we write /etc/machine-id here because systemd-nspawn needs it to link 417 // the container's journal to the host 418 mPath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "machine-id") 419 mId := strings.Replace(p.UUID.String(), "-", "", -1) 420 421 if err := ioutil.WriteFile(mPath, []byte(mId), 0644); err != nil { 422 log.Fatalf("error writing /etc/machine-id: %v\n", err) 423 } 424 425 args = append(args, "--link-journal=try-guest") 426 } 427 428 if !debug { 429 args = append(args, "--quiet") // silence most nspawn output (log_warning is currently not covered by this) 430 env = append(env, "SYSTEMD_LOG_LEVEL=err") // silence log_warning too 431 } 432 433 if len(privateUsers) > 0 { 434 args = append(args, "--private-users="+privateUsers) 435 } 436 437 keepUnit, err := isRunningFromUnitFile() 438 if err != nil { 439 return nil, nil, fmt.Errorf("error determining if we're running from a unit file: %v", err) 440 } 441 442 if keepUnit { 443 args = append(args, "--keep-unit") 444 } 445 446 nsargs, err := p.PodToNspawnArgs() 447 if err != nil { 448 return nil, nil, fmt.Errorf("failed to generate nspawn args: %v", err) 449 } 450 args = append(args, nsargs...) 451 452 // Arguments to systemd 453 args = append(args, "--") 454 args = append(args, "--default-standard-output=tty") // redirect all service logs straight to tty 455 if !debug { 456 args = append(args, "--log-target=null") // silence systemd output inside pod 457 // TODO remove --log-level=warning when we update stage1 to systemd v222 458 args = append(args, "--log-level=warning") // limit log output (systemd-shutdown ignores --log-target) 459 args = append(args, "--show-status=0") // silence systemd initialization status output 460 } 461 462 return args, env, nil 463 } 464 465 func withClearedCloExec(lfd int, f func() error) error { 466 err := sys.CloseOnExec(lfd, false) 467 if err != nil { 468 return err 469 } 470 defer sys.CloseOnExec(lfd, true) 471 472 return f() 473 } 474 475 func forwardedPorts(pod *Pod) ([]networking.ForwardedPort, error) { 476 var fps []networking.ForwardedPort 477 478 for _, ep := range pod.Manifest.Ports { 479 n := "" 480 fp := networking.ForwardedPort{} 481 482 for _, a := range pod.Manifest.Apps { 483 for _, p := range a.App.Ports { 484 if p.Name == ep.Name { 485 if n == "" { 486 fp.Protocol = p.Protocol 487 fp.HostPort = ep.HostPort 488 fp.PodPort = p.Port 489 n = a.Name.String() 490 } else { 491 return nil, fmt.Errorf("Ambiguous exposed port in PodManifest: %q and %q both define port %q", n, a.Name, p.Name) 492 } 493 } 494 } 495 } 496 497 if n == "" { 498 return nil, fmt.Errorf("Port name %q is not defined by any apps", ep.Name) 499 } 500 501 fps = append(fps, fp) 502 } 503 504 // TODO(eyakubovich): validate that there're no conflicts 505 506 return fps, nil 507 } 508 509 func writePpid(pid int) error { 510 // write ppid file as specified in 511 // Documentation/devel/stage1-implementors-guide.md 512 out, err := os.Getwd() 513 if err != nil { 514 return fmt.Errorf("Cannot get current working directory: %v\n", err) 515 } 516 // we are the parent of the process that is PID 1 in the container so we write our PID to "ppid" 517 err = ioutil.WriteFile(filepath.Join(out, "ppid"), 518 []byte(fmt.Sprintf("%d\n", pid)), 0644) 519 if err != nil { 520 return fmt.Errorf("Cannot write ppid file: %v\n", err) 521 } 522 return nil 523 } 524 525 func stage1() int { 526 uuid, err := types.NewUUID(flag.Arg(0)) 527 if err != nil { 528 fmt.Fprintln(os.Stderr, "UUID is missing or malformed") 529 return 1 530 } 531 532 root := "." 533 p, err := LoadPod(root, uuid) 534 if err != nil { 535 fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err) 536 return 1 537 } 538 539 // set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking 540 // network plugins 541 lfd, err := common.GetRktLockFD() 542 if err != nil { 543 fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err) 544 return 1 545 } 546 547 if err := sys.CloseOnExec(lfd, true); err != nil { 548 fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err) 549 return 1 550 } 551 552 mirrorLocalZoneInfo(p.Root) 553 554 flavor, _, err := p.getFlavor() 555 if err != nil { 556 fmt.Fprintf(os.Stderr, "Failed to get stage1 flavor: %v\n", err) 557 return 3 558 } 559 560 var n *networking.Networking 561 if netList.Contained() { 562 fps, err := forwardedPorts(p) 563 if err != nil { 564 fmt.Fprintln(os.Stderr, err.Error()) 565 return 6 566 } 567 568 n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor) 569 if err != nil { 570 fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err) 571 return 6 572 } 573 574 if err = n.Save(); err != nil { 575 fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err) 576 n.Teardown(flavor) 577 return 6 578 } 579 580 if len(mdsToken) > 0 { 581 hostIP, err := n.GetDefaultHostIP() 582 if err != nil { 583 fmt.Fprintf(os.Stderr, "Failed to get default Host IP: %v\n", err) 584 return 6 585 } 586 587 p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken) 588 } 589 } else { 590 if flavor == "kvm" { 591 fmt.Fprintf(os.Stderr, "Flavor kvm requires private network configuration (try --net).\n") 592 return 6 593 } 594 if len(mdsToken) > 0 { 595 p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken) 596 } 597 } 598 599 if err = p.WriteDefaultTarget(); err != nil { 600 fmt.Fprintf(os.Stderr, "Failed to write default.target: %v\n", err) 601 return 2 602 } 603 604 if err = p.WritePrepareAppTemplate(); err != nil { 605 fmt.Fprintf(os.Stderr, "Failed to write prepare-app service template: %v\n", err) 606 return 2 607 } 608 609 if err = p.PodToSystemd(interactive, flavor, privateUsers); err != nil { 610 fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err) 611 return 2 612 } 613 614 args, env, err := getArgsEnv(p, flavor, debug, n) 615 if err != nil { 616 fmt.Fprintf(os.Stderr, "Error: %v\n", err) 617 return 3 618 } 619 620 // create a separate mount namespace so the cgroup filesystems 621 // are unmounted when exiting the pod 622 if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { 623 log.Fatalf("Error unsharing: %v", err) 624 } 625 626 // we recursively make / a "shared and slave" so mount events from the 627 // new namespace don't propagate to the host namespace but mount events 628 // from the host propagate to the new namespace and are forwarded to 629 // its peer group 630 // See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 631 if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil { 632 log.Fatalf("Error making / a slave mount: %v", err) 633 } 634 if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil { 635 log.Fatalf("Error making / a shared and slave mount: %v", err) 636 } 637 638 enabledCgroups, err := cgroup.GetEnabledCgroups() 639 if err != nil { 640 fmt.Fprintf(os.Stderr, "Error getting cgroups: %v", err) 641 return 5 642 } 643 644 // mount host cgroups in the rkt mount namespace 645 if err := mountHostCgroups(enabledCgroups); err != nil { 646 log.Fatalf("Couldn't mount the host cgroups: %v\n", err) 647 return 5 648 } 649 650 var serviceNames []string 651 for _, app := range p.Manifest.Apps { 652 serviceNames = append(serviceNames, ServiceUnitName(app.Name)) 653 } 654 s1Root := common.Stage1RootfsPath(p.Root) 655 machineID := p.GetMachineID() 656 subcgroup, err := getContainerSubCgroup(machineID) 657 if err == nil { 658 if err := mountContainerCgroups(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil { 659 fmt.Fprintf(os.Stderr, "Couldn't mount the container cgroups: %v\n", err) 660 return 5 661 } 662 } else { 663 fmt.Fprintf(os.Stderr, "Continuing with per-app isolators disabled: %v\n", err) 664 } 665 666 if err = writePpid(os.Getpid()); err != nil { 667 fmt.Fprintln(os.Stderr, err.Error()) 668 return 4 669 } 670 671 err = withClearedCloExec(lfd, func() error { 672 return syscall.Exec(args[0], args, env) 673 }) 674 if err != nil { 675 fmt.Fprintf(os.Stderr, "Failed to execute %q: %v\n", args[0], err) 676 return 7 677 } 678 679 return 0 680 } 681 682 func areHostCgroupsMounted(enabledCgroups map[int][]string) bool { 683 controllers := cgroup.GetControllerDirs(enabledCgroups) 684 for _, c := range controllers { 685 if !cgroup.IsControllerMounted(c) { 686 return false 687 } 688 } 689 690 return true 691 } 692 693 // mountHostCgroups mounts the host cgroup hierarchy as required by 694 // systemd-nspawn. We need this because some distributions don't have the 695 // "name=systemd" cgroup or don't mount the cgroup controllers in 696 // "/sys/fs/cgroup", and systemd-nspawn needs this. Since this is mounted 697 // inside the rkt mount namespace, it doesn't affect the host. 698 func mountHostCgroups(enabledCgroups map[int][]string) error { 699 systemdControllerPath := "/sys/fs/cgroup/systemd" 700 if !areHostCgroupsMounted(enabledCgroups) { 701 if err := cgroup.CreateCgroups("/", enabledCgroups); err != nil { 702 return fmt.Errorf("error creating host cgroups: %v\n", err) 703 } 704 } 705 706 if !cgroup.IsControllerMounted("systemd") { 707 if err := os.MkdirAll(systemdControllerPath, 0700); err != nil { 708 return err 709 } 710 if err := syscall.Mount("cgroup", systemdControllerPath, "cgroup", 0, "none,name=systemd"); err != nil { 711 return fmt.Errorf("error mounting name=systemd hierarchy on %q: %v", systemdControllerPath, err) 712 } 713 } 714 715 return nil 716 } 717 718 // mountContainerCgroups mounts the cgroup controllers hierarchy in the container's 719 // namespace read-only, leaving the needed knobs in the subcgroup for each-app 720 // read-write so systemd inside stage1 can apply isolators to them 721 func mountContainerCgroups(s1Root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error { 722 if err := cgroup.CreateCgroups(s1Root, enabledCgroups); err != nil { 723 return fmt.Errorf("error creating container cgroups: %v\n", err) 724 } 725 if err := cgroup.RemountCgroupsRO(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil { 726 return fmt.Errorf("error restricting container cgroups: %v\n", err) 727 } 728 729 return nil 730 } 731 732 func getContainerSubCgroup(machineID string) (string, error) { 733 var subcgroup string 734 fromUnit, err := isRunningFromUnitFile() 735 if err != nil { 736 return "", fmt.Errorf("could not determine if we're running from a unit file: %v", err) 737 } 738 if fromUnit { 739 slice, err := getSlice() 740 if err != nil { 741 return "", fmt.Errorf("could not get slice name: %v", err) 742 } 743 slicePath, err := common.SliceToPath(slice) 744 if err != nil { 745 return "", fmt.Errorf("could not convert slice name to path: %v", err) 746 } 747 unit, err := getUnitFileName() 748 if err != nil { 749 return "", fmt.Errorf("could not get unit name: %v", err) 750 } 751 subcgroup = filepath.Join(slicePath, unit, "system.slice") 752 } else { 753 if machinedRegister() { 754 // we are not in the final cgroup yet: systemd-nspawn will move us 755 // to the correct cgroup later during registration so we can't 756 // look it up in /proc/self/cgroup 757 escapedmID := strings.Replace(machineID, "-", "\\x2d", -1) 758 machineDir := "machine-" + escapedmID + ".scope" 759 subcgroup = filepath.Join("machine.slice", machineDir, "system.slice") 760 } else { 761 // when registration is disabled the container will be directly 762 // under rkt's cgroup so we can look it up in /proc/self/cgroup 763 ownCgroupPath, err := cgroup.GetOwnCgroupPath("name=systemd") 764 if err != nil { 765 return "", fmt.Errorf("could not get own cgroup path: %v", err) 766 } 767 // systemd-nspawn won't work unless we're in a subcgroup. If we're 768 // in the root cgroup, we create a "rkt" subcgroup and we add 769 // ourselves to it 770 if ownCgroupPath == "/" { 771 ownCgroupPath = "/rkt" 772 if err := cgroup.JoinSubcgroup("systemd", ownCgroupPath); err != nil { 773 return "", fmt.Errorf("error joining %s subcgroup: %v", ownCgroupPath, err) 774 } 775 } 776 subcgroup = filepath.Join(ownCgroupPath, "system.slice") 777 } 778 } 779 780 return subcgroup, nil 781 } 782 783 func getUnitFileName() (unit string, err error) { 784 libname := C.CString("libsystemd.so") 785 defer C.free(unsafe.Pointer(libname)) 786 handle := C.dlopen(libname, C.RTLD_LAZY) 787 if handle == nil { 788 err = fmt.Errorf("error opening libsystemd.so") 789 return 790 } 791 defer func() { 792 if r := C.dlclose(handle); r != 0 { 793 err = fmt.Errorf("error closing libsystemd.so") 794 } 795 }() 796 797 sym := C.CString("sd_pid_get_unit") 798 defer C.free(unsafe.Pointer(sym)) 799 sd_pid_get_unit := C.dlsym(handle, sym) 800 if sd_pid_get_unit == nil { 801 err = fmt.Errorf("error resolving sd_pid_get_unit function") 802 return 803 } 804 805 var s string 806 u := C.CString(s) 807 defer C.free(unsafe.Pointer(u)) 808 809 ret := C.my_sd_pid_get_unit(sd_pid_get_unit, 0, &u) 810 if ret < 0 { 811 err = fmt.Errorf("error calling sd_pid_get_unit: %v", syscall.Errno(-ret)) 812 return 813 } 814 815 unit = C.GoString(u) 816 return 817 } 818 819 func getSlice() (slice string, err error) { 820 libname := C.CString("libsystemd.so") 821 defer C.free(unsafe.Pointer(libname)) 822 handle := C.dlopen(libname, C.RTLD_LAZY) 823 if handle == nil { 824 err = fmt.Errorf("error opening libsystemd.so") 825 return 826 } 827 defer func() { 828 if r := C.dlclose(handle); r != 0 { 829 err = fmt.Errorf("error closing libsystemd.so") 830 } 831 }() 832 833 sym := C.CString("sd_pid_get_slice") 834 defer C.free(unsafe.Pointer(sym)) 835 sd_pid_get_slice := C.dlsym(handle, sym) 836 if sd_pid_get_slice == nil { 837 err = fmt.Errorf("error resolving sd_pid_get_slice function") 838 return 839 } 840 841 var s string 842 sl := C.CString(s) 843 defer C.free(unsafe.Pointer(sl)) 844 845 ret := C.my_sd_pid_get_slice(sd_pid_get_slice, 0, &sl) 846 if ret < 0 { 847 err = fmt.Errorf("error calling sd_pid_get_slice: %v", syscall.Errno(-ret)) 848 return 849 } 850 851 slice = C.GoString(sl) 852 return 853 } 854 855 func isRunningFromUnitFile() (ret bool, err error) { 856 libname := C.CString("libsystemd.so") 857 defer C.free(unsafe.Pointer(libname)) 858 handle := C.dlopen(libname, C.RTLD_LAZY) 859 if handle == nil { 860 // we can't open libsystemd.so so we assume systemd is not 861 // installed and we're not running from a unit file 862 return 863 } 864 defer func() { 865 if r := C.dlclose(handle); r != 0 { 866 err = fmt.Errorf("error closing libsystemd.so") 867 } 868 }() 869 870 sd_pid_get_owner_uid := C.dlsym(handle, C.CString("sd_pid_get_owner_uid")) 871 if sd_pid_get_owner_uid == nil { 872 err = fmt.Errorf("error resolving sd_pid_get_owner_uid function") 873 return 874 } 875 876 var uid C.uid_t 877 errno := C.my_sd_pid_get_owner_uid(sd_pid_get_owner_uid, 0, &uid) 878 // when we're running from a unit file, sd_pid_get_owner_uid returns 879 // ENOENT (systemd <220) or ENXIO (systemd >=220) 880 switch { 881 case errno >= 0: 882 case syscall.Errno(-errno) == syscall.ENOENT || syscall.Errno(-errno) == syscall.ENXIO: 883 if C.am_session_leader() == 1 { 884 ret = true 885 } 886 default: 887 err = fmt.Errorf("error calling sd_pid_get_owner_uid: %v", syscall.Errno(-errno)) 888 } 889 return 890 } 891 892 func main() { 893 flag.Parse() 894 895 if !debug { 896 log.SetOutput(ioutil.Discard) 897 } 898 899 // move code into stage1() helper so defered fns get run 900 os.Exit(stage1()) 901 }