github.com/stackdocker/rkt@v0.10.1-0.20151109095037-1aa827478248/stage1/init/pod.go (about) 1 // Copyright 2014 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package main 18 19 import ( 20 "bytes" 21 "encoding/json" 22 "fmt" 23 "io" 24 "io/ioutil" 25 "os" 26 "path" 27 "path/filepath" 28 "strconv" 29 "strings" 30 31 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/spec/schema" 32 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/spec/schema/types" 33 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/coreos/go-systemd/unit" 34 "github.com/coreos/rkt/common" 35 "github.com/coreos/rkt/common/cgroup" 36 "github.com/coreos/rkt/pkg/uid" 37 initcommon "github.com/coreos/rkt/stage1/init/common" 38 "github.com/coreos/rkt/stage1/init/kvm" 39 ) 40 41 // Pod encapsulates a PodManifest and ImageManifests 42 type Pod struct { 43 Root string // root directory where the pod will be located 44 UUID types.UUID 45 Manifest *schema.PodManifest 46 Images map[string]*schema.ImageManifest 47 MetadataServiceURL string 48 Networks []string 49 } 50 51 const ( 52 // Name of the file storing the pod's flavor 53 flavorFile = "flavor" 54 sharedVolPerm = os.FileMode(0755) 55 ) 56 57 var ( 58 defaultEnv = map[string]string{ 59 "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 60 "SHELL": "/bin/sh", 61 "USER": "root", 62 "LOGNAME": "root", 63 "HOME": "/root", 64 } 65 ) 66 67 // LoadPod loads a Pod Manifest (as prepared by stage0) and 68 // its associated Application Manifests, under $root/stage1/opt/stage1/$apphash 69 func LoadPod(root string, uuid *types.UUID) (*Pod, error) { 70 p := &Pod{ 71 Root: root, 72 UUID: *uuid, 73 Images: make(map[string]*schema.ImageManifest), 74 } 75 76 buf, err := ioutil.ReadFile(common.PodManifestPath(p.Root)) 77 if err != nil { 78 return nil, fmt.Errorf("failed reading pod manifest: %v", err) 79 } 80 81 pm := &schema.PodManifest{} 82 if err := json.Unmarshal(buf, pm); err != nil { 83 return nil, fmt.Errorf("failed unmarshalling pod manifest: %v", err) 84 } 85 p.Manifest = pm 86 87 for i, app := range p.Manifest.Apps { 88 ampath := common.ImageManifestPath(p.Root, app.Name) 89 buf, err := ioutil.ReadFile(ampath) 90 if err != nil { 91 return nil, fmt.Errorf("failed reading app manifest %q: %v", ampath, err) 92 } 93 94 am := &schema.ImageManifest{} 95 if err = json.Unmarshal(buf, am); err != nil { 96 return nil, fmt.Errorf("failed unmarshalling app manifest %q: %v", ampath, err) 97 } 98 99 if _, ok := p.Images[app.Name.String()]; ok { 100 return nil, fmt.Errorf("got multiple definitions for app: %v", app.Name) 101 } 102 if app.App == nil { 103 p.Manifest.Apps[i].App = am.App 104 } 105 p.Images[app.Name.String()] = am 106 } 107 108 return p, nil 109 } 110 111 func simpleEscape(str string) string { 112 esc := strings.Replace(str, `\`, `\\`, -1) 113 esc = strings.Replace(esc, `"`, `\"`, -1) 114 esc = strings.Replace(esc, `'`, `\'`, -1) 115 116 return esc 117 } 118 119 // quoteExec returns an array of quoted strings appropriate for systemd execStart usage 120 func quoteExec(exec []string) string { 121 if len(exec) == 0 { 122 // existing callers prefix {"/appexec", "/app/root", "/work/dir", "/env/file"} so this shouldn't occur. 123 panic("empty exec") 124 } 125 126 var qexec []string 127 escExec := simpleEscape(exec[0]) 128 qexec = append(qexec, `"`+escExec+`"`) 129 130 if len(exec) > 1 { 131 for _, arg := range exec[1:] { 132 escArg := simpleEscape(arg) 133 escArg = strings.Replace(escArg, `$`, `$$`, -1) 134 qexec = append(qexec, `"`+escArg+`"`) 135 } 136 } 137 138 return strings.Join(qexec, " ") 139 } 140 141 func (p *Pod) WriteDefaultTarget() error { 142 opts := []*unit.UnitOption{ 143 unit.NewUnitOption("Unit", "Description", "rkt apps target"), 144 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 145 } 146 147 for i := range p.Manifest.Apps { 148 ra := &p.Manifest.Apps[i] 149 serviceName := ServiceUnitName(ra.Name) 150 opts = append(opts, unit.NewUnitOption("Unit", "After", serviceName)) 151 opts = append(opts, unit.NewUnitOption("Unit", "Wants", serviceName)) 152 } 153 154 unitsPath := filepath.Join(common.Stage1RootfsPath(p.Root), unitsDir) 155 file, err := os.OpenFile(filepath.Join(unitsPath, "default.target"), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 156 if err != nil { 157 return err 158 } 159 defer file.Close() 160 161 if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { 162 return err 163 } 164 165 return nil 166 } 167 168 func (p *Pod) WritePrepareAppTemplate() error { 169 opts := []*unit.UnitOption{ 170 unit.NewUnitOption("Unit", "Description", "Prepare minimum environment for chrooted applications"), 171 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 172 unit.NewUnitOption("Unit", "OnFailureJobMode", "fail"), 173 unit.NewUnitOption("Unit", "Requires", "systemd-journald.service"), 174 unit.NewUnitOption("Unit", "After", "systemd-journald.service"), 175 unit.NewUnitOption("Service", "Type", "oneshot"), 176 unit.NewUnitOption("Service", "Restart", "no"), 177 unit.NewUnitOption("Service", "ExecStart", "/prepare-app %I"), 178 unit.NewUnitOption("Service", "User", "0"), 179 unit.NewUnitOption("Service", "Group", "0"), 180 unit.NewUnitOption("Service", "CapabilityBoundingSet", "CAP_SYS_ADMIN CAP_DAC_OVERRIDE"), 181 } 182 183 unitsPath := filepath.Join(common.Stage1RootfsPath(p.Root), unitsDir) 184 file, err := os.OpenFile(filepath.Join(unitsPath, "prepare-app@.service"), os.O_WRONLY|os.O_CREATE, 0644) 185 if err != nil { 186 return fmt.Errorf("failed to create service unit file: %v", err) 187 } 188 defer file.Close() 189 190 if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { 191 return fmt.Errorf("failed to write service unit file: %v", err) 192 } 193 194 return nil 195 } 196 197 func (p *Pod) writeAppReaper(appName string) error { 198 opts := []*unit.UnitOption{ 199 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("%s Reaper", appName)), 200 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 201 unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"), 202 unit.NewUnitOption("Unit", "Wants", "shutdown.service"), 203 unit.NewUnitOption("Unit", "After", "shutdown.service"), 204 unit.NewUnitOption("Unit", "Conflicts", "exit.target"), 205 unit.NewUnitOption("Unit", "Conflicts", "halt.target"), 206 unit.NewUnitOption("Unit", "Conflicts", "poweroff.target"), 207 unit.NewUnitOption("Service", "RemainAfterExit", "yes"), 208 unit.NewUnitOption("Service", "ExecStop", fmt.Sprintf("/reaper.sh %s", appName)), 209 } 210 211 unitsPath := filepath.Join(common.Stage1RootfsPath(p.Root), unitsDir) 212 file, err := os.OpenFile(filepath.Join(unitsPath, fmt.Sprintf("reaper-%s.service", appName)), os.O_WRONLY|os.O_CREATE, 0644) 213 if err != nil { 214 return fmt.Errorf("failed to create service unit file: %v", err) 215 } 216 defer file.Close() 217 218 if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { 219 return fmt.Errorf("failed to write service unit file: %v", err) 220 } 221 222 return nil 223 } 224 225 func generateGidArg(gid int, supplGid []int) string { 226 arg := []string{strconv.Itoa(gid)} 227 for _, sg := range supplGid { 228 arg = append(arg, strconv.Itoa(sg)) 229 } 230 return strings.Join(arg, ",") 231 } 232 233 // appToSystemd transforms the provided RuntimeApp+ImageManifest into systemd units 234 func (p *Pod) appToSystemd(ra *schema.RuntimeApp, interactive bool, flavor string, privateUsers string) error { 235 app := ra.App 236 appName := ra.Name 237 image, ok := p.Images[appName.String()] 238 if !ok { 239 // This is impossible as we have updated the map in LoadPod(). 240 panic(fmt.Sprintf("No images for app %q", ra.Name.String())) 241 } 242 imgName := image.Name 243 244 workDir := "/" 245 if app.WorkingDirectory != "" { 246 workDir = app.WorkingDirectory 247 } 248 249 env := app.Environment 250 251 env.Set("AC_APP_NAME", appName.String()) 252 if p.MetadataServiceURL != "" { 253 env.Set("AC_METADATA_URL", p.MetadataServiceURL) 254 } 255 256 if err := p.writeEnvFile(env, appName, privateUsers); err != nil { 257 return fmt.Errorf("unable to write environment file: %v", err) 258 } 259 260 // This is a partial implementation for app.User and app.Group: 261 // For now, only numeric ids (and the string "root") are supported. 262 var uid, gid int 263 var err error 264 if app.User == "root" { 265 uid = 0 266 } else { 267 uid, err = strconv.Atoi(app.User) 268 if err != nil { 269 return fmt.Errorf("non-numerical user id not supported yet") 270 } 271 } 272 if app.Group == "root" { 273 gid = 0 274 } else { 275 gid, err = strconv.Atoi(app.Group) 276 if err != nil { 277 return fmt.Errorf("non-numerical group id not supported yet") 278 } 279 } 280 281 execWrap := []string{"/appexec", common.RelAppRootfsPath(appName), workDir, RelEnvFilePath(appName), strconv.Itoa(uid), generateGidArg(gid, app.SupplementaryGIDs)} 282 execStart := quoteExec(append(execWrap, app.Exec...)) 283 opts := []*unit.UnitOption{ 284 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)), 285 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 286 unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)), 287 unit.NewUnitOption("Service", "Restart", "no"), 288 unit.NewUnitOption("Service", "ExecStart", execStart), 289 unit.NewUnitOption("Service", "User", "0"), 290 unit.NewUnitOption("Service", "Group", "0"), 291 } 292 293 if interactive { 294 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) 295 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) 296 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) 297 } else { 298 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) 299 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) 300 opts = append(opts, unit.NewUnitOption("Service", "SyslogIdentifier", filepath.Base(app.Exec[0]))) 301 } 302 303 // When an app fails, we shut down the pod 304 opts = append(opts, unit.NewUnitOption("Unit", "OnFailure", "halt.target")) 305 306 for _, eh := range app.EventHandlers { 307 var typ string 308 switch eh.Name { 309 case "pre-start": 310 typ = "ExecStartPre" 311 case "post-stop": 312 typ = "ExecStopPost" 313 default: 314 return fmt.Errorf("unrecognized eventHandler: %v", eh.Name) 315 } 316 exec := quoteExec(append(execWrap, eh.Exec...)) 317 opts = append(opts, unit.NewUnitOption("Service", typ, exec)) 318 } 319 320 // Some pre-start jobs take a long time, set the timeout to 0 321 opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0")) 322 323 var saPorts []types.Port 324 for _, p := range app.Ports { 325 if p.SocketActivated { 326 saPorts = append(saPorts, p) 327 } 328 } 329 330 for _, i := range app.Isolators { 331 switch v := i.Value().(type) { 332 case *types.ResourceMemory: 333 opts, err = cgroup.MaybeAddIsolator(opts, "memory", v.Limit()) 334 if err != nil { 335 return err 336 } 337 case *types.ResourceCPU: 338 opts, err = cgroup.MaybeAddIsolator(opts, "cpu", v.Limit()) 339 if err != nil { 340 return err 341 } 342 } 343 } 344 345 if len(saPorts) > 0 { 346 sockopts := []*unit.UnitOption{ 347 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")), 348 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 349 unit.NewUnitOption("Socket", "BindIPv6Only", "both"), 350 unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)), 351 } 352 353 for _, sap := range saPorts { 354 var proto string 355 switch sap.Protocol { 356 case "tcp": 357 proto = "ListenStream" 358 case "udp": 359 proto = "ListenDatagram" 360 default: 361 return fmt.Errorf("unrecognized protocol: %v", sap.Protocol) 362 } 363 sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", sap.Port))) 364 } 365 366 file, err := os.OpenFile(SocketUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) 367 if err != nil { 368 return fmt.Errorf("failed to create socket file: %v", err) 369 } 370 defer file.Close() 371 372 if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { 373 return fmt.Errorf("failed to write socket unit file: %v", err) 374 } 375 376 if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(p.Root, appName)); err != nil { 377 return fmt.Errorf("failed to link socket want: %v", err) 378 } 379 380 opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName))) 381 } 382 383 opts = append(opts, unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(appName))) 384 opts = append(opts, unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(appName))) 385 386 file, err := os.OpenFile(ServiceUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) 387 if err != nil { 388 return fmt.Errorf("failed to create service unit file: %v", err) 389 } 390 defer file.Close() 391 392 if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { 393 return fmt.Errorf("failed to write service unit file: %v", err) 394 } 395 396 if err = os.Symlink(path.Join("..", ServiceUnitName(appName)), ServiceWantPath(p.Root, appName)); err != nil { 397 return fmt.Errorf("failed to link service want: %v", err) 398 } 399 400 if flavor == "kvm" { 401 // bind mount all shared volumes from /mnt/volumeName (we don't use mechanism for bind-mounting given by nspawn) 402 err := kvm.AppToSystemdMountUnits(common.Stage1RootfsPath(p.Root), appName, p.Manifest.Volumes, ra, unitsDir) 403 if err != nil { 404 return fmt.Errorf("failed to prepare mount units: %v", err) 405 } 406 407 } 408 409 if err = p.writeAppReaper(appName.String()); err != nil { 410 return fmt.Errorf("Failed to write app %q reaper service: %v\n", appName, err) 411 } 412 413 return nil 414 } 415 416 // writeEnvFile creates an environment file for given app name, the minimum 417 // required environment variables by the appc spec will be set to sensible 418 // defaults here if they're not provided by env. 419 func (p *Pod) writeEnvFile(env types.Environment, appName types.ACName, privateUsers string) error { 420 ef := bytes.Buffer{} 421 422 for dk, dv := range defaultEnv { 423 if _, exists := env.Get(dk); !exists { 424 fmt.Fprintf(&ef, "%s=%s\000", dk, dv) 425 } 426 } 427 428 for _, e := range env { 429 fmt.Fprintf(&ef, "%s=%s\000", e.Name, e.Value) 430 } 431 432 uidRange := uid.NewBlankUidRange() 433 if err := uidRange.Deserialize([]byte(privateUsers)); err != nil { 434 return err 435 } 436 437 envFilePath := EnvFilePath(p.Root, appName) 438 if err := ioutil.WriteFile(envFilePath, ef.Bytes(), 0644); err != nil { 439 return err 440 } 441 442 if uidRange.Shift != 0 && uidRange.Count != 0 { 443 if err := os.Chown(envFilePath, int(uidRange.Shift), int(uidRange.Shift)); err != nil { 444 return err 445 } 446 } 447 448 return nil 449 } 450 451 // PodToSystemd creates the appropriate systemd service unit files for 452 // all the constituent apps of the Pod 453 func (p *Pod) PodToSystemd(interactive bool, flavor string, privateUsers string) error { 454 455 if flavor == "kvm" { 456 // prepare all applications names to become dependency for mount units 457 // all host-shared folder has to become available before applications starts 458 var appNames []types.ACName 459 for _, runtimeApp := range p.Manifest.Apps { 460 appNames = append(appNames, runtimeApp.Name) 461 } 462 463 // mount host volumes through some remote file system e.g. 9p to /mnt/volumeName location 464 // order is important here: podToSystemHostMountUnits prepares folders that are checked by each appToSystemdMountUnits later 465 err := kvm.PodToSystemdHostMountUnits(common.Stage1RootfsPath(p.Root), p.Manifest.Volumes, appNames, unitsDir) 466 if err != nil { 467 return fmt.Errorf("failed to transform pod volumes into mount units: %v", err) 468 } 469 } 470 471 for i := range p.Manifest.Apps { 472 ra := &p.Manifest.Apps[i] 473 if err := p.appToSystemd(ra, interactive, flavor, privateUsers); err != nil { 474 return fmt.Errorf("failed to transform app %q into systemd service: %v", ra.Name, err) 475 } 476 } 477 return nil 478 } 479 480 // appToNspawnArgs transforms the given app manifest, with the given associated 481 // app name, into a subset of applicable systemd-nspawn argument 482 func (p *Pod) appToNspawnArgs(ra *schema.RuntimeApp) ([]string, error) { 483 var args []string 484 appName := ra.Name 485 app := ra.App 486 487 sharedVolPath := common.SharedVolumesPath(p.Root) 488 if err := os.MkdirAll(sharedVolPath, sharedVolPerm); err != nil { 489 return nil, fmt.Errorf("could not create shared volumes directory: %v", err) 490 } 491 if err := os.Chmod(sharedVolPath, sharedVolPerm); err != nil { 492 return nil, fmt.Errorf("could not change permissions of %q: %v", sharedVolPath, err) 493 } 494 495 vols := make(map[types.ACName]types.Volume) 496 for _, v := range p.Manifest.Volumes { 497 vols[v.Name] = v 498 499 if v.Kind == "empty" { 500 if err := os.MkdirAll(filepath.Join(sharedVolPath, v.Name.String()), sharedVolPerm); err != nil { 501 return nil, fmt.Errorf("could not create shared volume %q: %v", v.Name, err) 502 } 503 } 504 } 505 506 mounts, err := initcommon.GenerateMounts(ra, vols) 507 if err != nil { 508 return nil, err 509 } 510 511 for _, m := range mounts { 512 vol := vols[m.Volume] 513 514 opt := make([]string, 4) 515 516 if initcommon.IsMountReadOnly(vol, app.MountPoints) { 517 opt[0] = "--bind-ro=" 518 } else { 519 opt[0] = "--bind=" 520 } 521 522 switch vol.Kind { 523 case "host": 524 opt[1] = vol.Source 525 case "empty": 526 absRoot, err := filepath.Abs(p.Root) 527 if err != nil { 528 return nil, fmt.Errorf("cannot get pod's root absolute path: %v\n", err) 529 } 530 opt[1] = filepath.Join(common.SharedVolumesPath(absRoot), vol.Name.String()) 531 default: 532 return nil, fmt.Errorf(`invalid volume kind %q. Must be one of "host" or "empty".`, vol.Kind) 533 } 534 opt[2] = ":" 535 opt[3] = filepath.Join(common.RelAppRootfsPath(appName), m.Path) 536 537 args = append(args, strings.Join(opt, "")) 538 } 539 540 for _, i := range app.Isolators { 541 switch v := i.Value().(type) { 542 case types.LinuxCapabilitiesSet: 543 var caps []string 544 // TODO: cleanup the API on LinuxCapabilitiesSet to give strings easily. 545 for _, c := range v.Set() { 546 caps = append(caps, string(c)) 547 } 548 if i.Name == types.LinuxCapabilitiesRetainSetName { 549 capList := strings.Join(caps, ",") 550 args = append(args, "--capability="+capList) 551 } 552 } 553 } 554 555 return args, nil 556 } 557 558 // PodToNspawnArgs renders a prepared Pod as a systemd-nspawn 559 // argument list ready to be executed 560 func (p *Pod) PodToNspawnArgs() ([]string, error) { 561 args := []string{ 562 "--uuid=" + p.UUID.String(), 563 "--machine=" + p.GetMachineID(), 564 "--directory=" + common.Stage1RootfsPath(p.Root), 565 } 566 567 for i := range p.Manifest.Apps { 568 aa, err := p.appToNspawnArgs(&p.Manifest.Apps[i]) 569 if err != nil { 570 return nil, err 571 } 572 args = append(args, aa...) 573 } 574 575 return args, nil 576 } 577 578 func (p *Pod) getFlavor() (flavor string, systemdVersion string, err error) { 579 flavor, err = os.Readlink(filepath.Join(common.Stage1RootfsPath(p.Root), "flavor")) 580 if err != nil { 581 return "", "", fmt.Errorf("unable to determine stage1 flavor: %v", err) 582 } 583 584 if flavor == "host" { 585 // This flavor does not contain systemd, so don't return systemdVersion 586 return flavor, "", nil 587 } 588 589 systemdVersionBytes, err := ioutil.ReadFile(filepath.Join(common.Stage1RootfsPath(p.Root), "systemd-version")) 590 if err != nil { 591 return "", "", fmt.Errorf("unable to determine stage1's systemd version: %v", err) 592 } 593 systemdVersion = strings.Trim(string(systemdVersionBytes), " \n") 594 return flavor, systemdVersion, nil 595 } 596 597 // GetAppHashes returns a list of hashes of the apps in this pod 598 func (p *Pod) GetAppHashes() []types.Hash { 599 var names []types.Hash 600 for _, a := range p.Manifest.Apps { 601 names = append(names, a.Image.ID) 602 } 603 604 return names 605 } 606 607 // GetMachineID returns the machine id string of the pod to be passed to 608 // systemd-nspawn 609 func (p *Pod) GetMachineID() string { 610 return "rkt-" + p.UUID.String() 611 }