github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/controller.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package boot 16 17 import ( 18 "errors" 19 "fmt" 20 "os" 21 gtime "time" 22 23 specs "github.com/opencontainers/runtime-spec/specs-go" 24 "golang.org/x/sys/unix" 25 "github.com/SagerNet/gvisor/pkg/control/server" 26 "github.com/SagerNet/gvisor/pkg/fd" 27 "github.com/SagerNet/gvisor/pkg/log" 28 "github.com/SagerNet/gvisor/pkg/sentry/control" 29 "github.com/SagerNet/gvisor/pkg/sentry/fs" 30 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 31 "github.com/SagerNet/gvisor/pkg/sentry/socket/netstack" 32 "github.com/SagerNet/gvisor/pkg/sentry/state" 33 "github.com/SagerNet/gvisor/pkg/sentry/time" 34 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 35 "github.com/SagerNet/gvisor/pkg/sentry/watchdog" 36 "github.com/SagerNet/gvisor/pkg/tcpip/stack" 37 "github.com/SagerNet/gvisor/pkg/urpc" 38 "github.com/SagerNet/gvisor/runsc/boot/pprof" 39 "github.com/SagerNet/gvisor/runsc/config" 40 "github.com/SagerNet/gvisor/runsc/specutils" 41 ) 42 43 const ( 44 // ContainerCheckpoint checkpoints a container. 45 ContainerCheckpoint = "containerManager.Checkpoint" 46 47 // ContainerCreate creates a container. 48 ContainerCreate = "containerManager.Create" 49 50 // ContainerDestroy is used to stop a non-root container and free all 51 // associated resources in the sandbox. 52 ContainerDestroy = "containerManager.Destroy" 53 54 // ContainerEvent is the URPC endpoint for getting stats about the 55 // container used by "runsc events". 56 ContainerEvent = "containerManager.Event" 57 58 // ContainerExecuteAsync is the URPC endpoint for executing a command in a 59 // container. 60 ContainerExecuteAsync = "containerManager.ExecuteAsync" 61 62 // ContainerPause pauses the container. 63 ContainerPause = "containerManager.Pause" 64 65 // ContainerProcesses is the URPC endpoint for getting the list of 66 // processes running in a container. 67 ContainerProcesses = "containerManager.Processes" 68 69 // ContainerRestore restores a container from a statefile. 70 ContainerRestore = "containerManager.Restore" 71 72 // ContainerResume unpauses the paused container. 73 ContainerResume = "containerManager.Resume" 74 75 // ContainerSignal is used to send a signal to a container. 76 ContainerSignal = "containerManager.Signal" 77 78 // ContainerSignalProcess is used to send a signal to a particular 79 // process in a container. 80 ContainerSignalProcess = "containerManager.SignalProcess" 81 82 // ContainerStart is the URPC endpoint for running a non-root container 83 // within a sandbox. 84 ContainerStart = "containerManager.Start" 85 86 // ContainerWait is used to wait on the init process of the container 87 // and return its ExitStatus. 88 ContainerWait = "containerManager.Wait" 89 90 // ContainerWaitPID is used to wait on a process with a certain PID in 91 // the sandbox and return its ExitStatus. 92 ContainerWaitPID = "containerManager.WaitPID" 93 94 // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links 95 // and routes in a network stack. 96 NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" 97 98 // RootContainerStart is the URPC endpoint for starting a new sandbox 99 // with root container. 100 RootContainerStart = "containerManager.StartRoot" 101 102 // SandboxStacks collects sandbox stacks for debugging. 103 SandboxStacks = "debug.Stacks" 104 ) 105 106 // Profiling related commands (see pprof.go for more details). 107 const ( 108 CPUProfile = "Profile.CPU" 109 HeapProfile = "Profile.Heap" 110 BlockProfile = "Profile.Block" 111 MutexProfile = "Profile.Mutex" 112 Trace = "Profile.Trace" 113 ) 114 115 // Logging related commands (see logging.go for more details). 116 const ( 117 ChangeLogging = "Logging.Change" 118 ) 119 120 // ControlSocketAddr generates an abstract unix socket name for the given ID. 121 func ControlSocketAddr(id string) string { 122 return fmt.Sprintf("\x00runsc-sandbox.%s", id) 123 } 124 125 // controller holds the control server, and is used for communication into the 126 // sandbox. 127 type controller struct { 128 // srv is the control server. 129 srv *server.Server 130 131 // manager holds the containerManager methods. 132 manager *containerManager 133 } 134 135 // newController creates a new controller. The caller must call 136 // controller.srv.StartServing() to start the controller. 137 func newController(fd int, l *Loader) (*controller, error) { 138 ctrl := &controller{} 139 var err error 140 ctrl.srv, err = server.CreateFromFD(fd) 141 if err != nil { 142 return nil, err 143 } 144 145 ctrl.manager = &containerManager{ 146 startChan: make(chan struct{}), 147 startResultChan: make(chan error), 148 l: l, 149 } 150 ctrl.srv.Register(ctrl.manager) 151 152 if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { 153 net := &Network{ 154 Stack: eps.Stack, 155 } 156 ctrl.srv.Register(net) 157 } 158 159 ctrl.srv.Register(&debug{}) 160 ctrl.srv.Register(&control.Logging{}) 161 162 if l.root.conf.ProfileEnable { 163 ctrl.srv.Register(control.NewProfile(l.k)) 164 } 165 166 return ctrl, nil 167 } 168 169 // stopRPCTimeout is the time for clients to complete ongoing RPCs. 170 const stopRPCTimeout = 15 * gtime.Second 171 172 func (c *controller) stop() { 173 c.srv.Stop(stopRPCTimeout) 174 } 175 176 // containerManager manages sandbox containers. 177 type containerManager struct { 178 // startChan is used to signal when the root container process should 179 // be started. 180 startChan chan struct{} 181 182 // startResultChan is used to signal when the root container has 183 // started. Any errors encountered during startup will be sent to the 184 // channel. A nil value indicates success. 185 startResultChan chan error 186 187 // l is the loader that creates containers and sandboxes. 188 l *Loader 189 } 190 191 // StartRoot will start the root container process. 192 func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { 193 log.Debugf("containerManager.StartRoot, cid: %s", *cid) 194 // Tell the root container to start and wait for the result. 195 cm.startChan <- struct{}{} 196 if err := <-cm.startResultChan; err != nil { 197 return fmt.Errorf("starting sandbox: %v", err) 198 } 199 return nil 200 } 201 202 // Processes retrieves information about processes running in the sandbox. 203 func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { 204 log.Debugf("containerManager.Processes, cid: %s", *cid) 205 return control.Processes(cm.l.k, *cid, out) 206 } 207 208 // CreateArgs contains arguments to the Create method. 209 type CreateArgs struct { 210 // CID is the ID of the container to start. 211 CID string 212 213 // FilePayload may contain a TTY file for the terminal, if enabled. 214 urpc.FilePayload 215 } 216 217 // Create creates a container within a sandbox. 218 func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { 219 log.Debugf("containerManager.Create: %s", args.CID) 220 221 if len(args.Files) > 1 { 222 return fmt.Errorf("start arguments must have at most 1 files for TTY") 223 } 224 var tty *fd.FD 225 if len(args.Files) == 1 { 226 var err error 227 tty, err = fd.NewFromFile(args.Files[0]) 228 if err != nil { 229 return fmt.Errorf("error dup'ing TTY file: %w", err) 230 } 231 } 232 return cm.l.createContainer(args.CID, tty) 233 } 234 235 // StartArgs contains arguments to the Start method. 236 type StartArgs struct { 237 // Spec is the spec of the container to start. 238 Spec *specs.Spec 239 240 // Config is the runsc-specific configuration for the sandbox. 241 Conf *config.Config 242 243 // CID is the ID of the container to start. 244 CID string 245 246 // FilePayload contains, in order: 247 // * stdin, stdout, and stderr (optional: if terminal is disabled). 248 // * file descriptors to connect to gofer to serve the root filesystem. 249 urpc.FilePayload 250 } 251 252 // Start runs a created container within a sandbox. 253 func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { 254 // Validate arguments. 255 if args == nil { 256 return errors.New("start missing arguments") 257 } 258 log.Debugf("containerManager.Start, cid: %s, args: %+v", args.CID, args) 259 if args.Spec == nil { 260 return errors.New("start arguments missing spec") 261 } 262 if args.Conf == nil { 263 return errors.New("start arguments missing config") 264 } 265 if args.CID == "" { 266 return errors.New("start argument missing container ID") 267 } 268 if len(args.Files) < 1 { 269 return fmt.Errorf("start arguments must contain at least one file for the container root gofer") 270 } 271 272 // All validation passed, logs the spec for debugging. 273 specutils.LogSpec(args.Spec) 274 275 goferFiles := args.Files 276 var stdios []*fd.FD 277 if !args.Spec.Process.Terminal { 278 // When not using a terminal, stdios come as the first 3 files in the 279 // payload. 280 if l := len(args.Files); l < 4 { 281 return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l) 282 } 283 var err error 284 stdios, err = fd.NewFromFiles(goferFiles[:3]) 285 if err != nil { 286 return fmt.Errorf("error dup'ing stdio files: %w", err) 287 } 288 goferFiles = goferFiles[3:] 289 } 290 defer func() { 291 for _, fd := range stdios { 292 _ = fd.Close() 293 } 294 }() 295 296 goferFDs, err := fd.NewFromFiles(goferFiles) 297 if err != nil { 298 return fmt.Errorf("error dup'ing gofer files: %w", err) 299 } 300 defer func() { 301 for _, fd := range goferFDs { 302 _ = fd.Close() 303 } 304 }() 305 306 if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { 307 log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err) 308 return err 309 } 310 log.Debugf("Container started, cid: %s", args.CID) 311 return nil 312 } 313 314 // Destroy stops a container if it is still running and cleans up its 315 // filesystem. 316 func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { 317 log.Debugf("containerManager.destroy, cid: %s", *cid) 318 return cm.l.destroyContainer(*cid) 319 } 320 321 // ExecuteAsync starts running a command on a created or running sandbox. It 322 // returns the PID of the new process. 323 func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { 324 log.Debugf("containerManager.ExecuteAsync, cid: %s, args: %+v", args.ContainerID, args) 325 tgid, err := cm.l.executeAsync(args) 326 if err != nil { 327 log.Debugf("containerManager.ExecuteAsync failed, cid: %s, args: %+v, err: %v", args.ContainerID, args, err) 328 return err 329 } 330 *pid = int32(tgid) 331 return nil 332 } 333 334 // Checkpoint pauses a sandbox and saves its state. 335 func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { 336 log.Debugf("containerManager.Checkpoint") 337 // TODO(github.com/SagerNet/issues/6243): save/restore not supported w/ hostinet 338 if cm.l.root.conf.Network == config.NetworkHost { 339 return errors.New("checkpoint not supported when using hostinet") 340 } 341 342 state := control.State{ 343 Kernel: cm.l.k, 344 Watchdog: cm.l.watchdog, 345 } 346 return state.Save(o, nil) 347 } 348 349 // Pause suspends a container. 350 func (cm *containerManager) Pause(_, _ *struct{}) error { 351 log.Debugf("containerManager.Pause") 352 // TODO(github.com/SagerNet/issues/6243): save/restore not supported w/ hostinet 353 if cm.l.root.conf.Network == config.NetworkHost { 354 return errors.New("pause not supported when using hostinet") 355 } 356 cm.l.k.Pause() 357 return nil 358 } 359 360 // RestoreOpts contains options related to restoring a container's file system. 361 type RestoreOpts struct { 362 // FilePayload contains the state file to be restored, followed by the 363 // platform device file if necessary. 364 urpc.FilePayload 365 366 // SandboxID contains the ID of the sandbox. 367 SandboxID string 368 } 369 370 // Restore loads a container from a statefile. 371 // The container's current kernel is destroyed, a restore environment is 372 // created, and the kernel is recreated with the restore state file. The 373 // container then sends the signal to start. 374 func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { 375 log.Debugf("containerManager.Restore") 376 377 var specFile, deviceFile *os.File 378 switch numFiles := len(o.Files); numFiles { 379 case 2: 380 // The device file is donated to the platform. 381 // Can't take ownership away from os.File. dup them to get a new FD. 382 fd, err := unix.Dup(int(o.Files[1].Fd())) 383 if err != nil { 384 return fmt.Errorf("failed to dup file: %v", err) 385 } 386 deviceFile = os.NewFile(uintptr(fd), "platform device") 387 fallthrough 388 case 1: 389 specFile = o.Files[0] 390 case 0: 391 return fmt.Errorf("at least one file must be passed to Restore") 392 default: 393 return fmt.Errorf("at most two files may be passed to Restore") 394 } 395 396 // Pause the kernel while we build a new one. 397 cm.l.k.Pause() 398 399 p, err := createPlatform(cm.l.root.conf, deviceFile) 400 if err != nil { 401 return fmt.Errorf("creating platform: %v", err) 402 } 403 k := &kernel.Kernel{ 404 Platform: p, 405 } 406 mf, err := createMemoryFile() 407 if err != nil { 408 return fmt.Errorf("creating memory file: %v", err) 409 } 410 k.SetMemoryFile(mf) 411 networkStack := cm.l.k.RootNetworkNamespace().Stack() 412 cm.l.k = k 413 414 // Set up the restore environment. 415 ctx := k.SupervisorContext() 416 mntr := newContainerMounter(&cm.l.root, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled) 417 if kernel.VFS2Enabled { 418 ctx, err = mntr.configureRestore(ctx) 419 if err != nil { 420 return fmt.Errorf("configuring filesystem restore: %v", err) 421 } 422 } else { 423 renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) 424 if err != nil { 425 return fmt.Errorf("creating RestoreEnvironment: %v", err) 426 } 427 fs.SetRestoreEnvironment(*renv) 428 } 429 430 // Prepare to load from the state file. 431 if eps, ok := networkStack.(*netstack.Stack); ok { 432 stack.StackFromEnv = eps.Stack // FIXME(b/36201077) 433 } 434 info, err := specFile.Stat() 435 if err != nil { 436 return err 437 } 438 if info.Size() == 0 { 439 return fmt.Errorf("file cannot be empty") 440 } 441 442 if cm.l.root.conf.ProfileEnable { 443 // pprof.Initialize opens /proc/self/maps, so has to be called before 444 // installing seccomp filters. 445 pprof.Initialize() 446 } 447 448 // Seccomp filters have to be applied before parsing the state file. 449 if err := cm.l.installSeccompFilters(); err != nil { 450 return err 451 } 452 453 // Load the state. 454 loadOpts := state.LoadOpts{Source: specFile} 455 if err := loadOpts.Load(ctx, k, nil, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { 456 return err 457 } 458 459 // Since we have a new kernel we also must make a new watchdog. 460 dogOpts := watchdog.DefaultOpts 461 dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction 462 dog := watchdog.New(k, dogOpts) 463 464 // Change the loader fields to reflect the changes made when restoring. 465 cm.l.k = k 466 cm.l.watchdog = dog 467 cm.l.root.procArgs = kernel.CreateProcessArgs{} 468 cm.l.restore = true 469 470 // Reinitialize the sandbox ID and processes map. Note that it doesn't 471 // restore the state of multiple containers, nor exec processes. 472 cm.l.sandboxID = o.SandboxID 473 cm.l.mu.Lock() 474 eid := execID{cid: o.SandboxID} 475 cm.l.processes = map[execID]*execProcess{ 476 eid: { 477 tg: cm.l.k.GlobalInit(), 478 }, 479 } 480 cm.l.mu.Unlock() 481 482 // Tell the root container to start and wait for the result. 483 cm.startChan <- struct{}{} 484 if err := <-cm.startResultChan; err != nil { 485 return fmt.Errorf("starting sandbox: %v", err) 486 } 487 488 return nil 489 } 490 491 // Resume unpauses a container. 492 func (cm *containerManager) Resume(_, _ *struct{}) error { 493 log.Debugf("containerManager.Resume") 494 cm.l.k.Unpause() 495 return nil 496 } 497 498 // Wait waits for the init process in the given container. 499 func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { 500 log.Debugf("containerManager.Wait, cid: %s", *cid) 501 err := cm.l.waitContainer(*cid, waitStatus) 502 log.Debugf("containerManager.Wait returned, cid: %s, waitStatus: %#x, err: %v", *cid, *waitStatus, err) 503 return err 504 } 505 506 // WaitPIDArgs are arguments to the WaitPID method. 507 type WaitPIDArgs struct { 508 // PID is the PID in the container's PID namespace. 509 PID int32 510 511 // CID is the container ID. 512 CID string 513 } 514 515 // WaitPID waits for the process with PID 'pid' in the sandbox. 516 func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { 517 log.Debugf("containerManager.Wait, cid: %s, pid: %d", args.CID, args.PID) 518 err := cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) 519 log.Debugf("containerManager.Wait, cid: %s, pid: %d, waitStatus: %#x, err: %v", args.CID, args.PID, *waitStatus, err) 520 return err 521 } 522 523 // SignalDeliveryMode enumerates different signal delivery modes. 524 type SignalDeliveryMode int 525 526 const ( 527 // DeliverToProcess delivers the signal to the container process with 528 // the specified PID. If PID is 0, then the container init process is 529 // signaled. 530 DeliverToProcess SignalDeliveryMode = iota 531 532 // DeliverToAllProcesses delivers the signal to all processes in the 533 // container. PID must be 0. 534 DeliverToAllProcesses 535 536 // DeliverToForegroundProcessGroup delivers the signal to the 537 // foreground process group in the same TTY session as the specified 538 // process. If PID is 0, then the signal is delivered to the foreground 539 // process group for the TTY for the init process. 540 DeliverToForegroundProcessGroup 541 ) 542 543 func (s SignalDeliveryMode) String() string { 544 switch s { 545 case DeliverToProcess: 546 return "Process" 547 case DeliverToAllProcesses: 548 return "All" 549 case DeliverToForegroundProcessGroup: 550 return "Foreground Process Group" 551 } 552 return fmt.Sprintf("unknown signal delivery mode: %d", s) 553 } 554 555 // SignalArgs are arguments to the Signal method. 556 type SignalArgs struct { 557 // CID is the container ID. 558 CID string 559 560 // Signo is the signal to send to the process. 561 Signo int32 562 563 // PID is the process ID in the given container that will be signaled, 564 // relative to the root PID namespace, not the container's. 565 // If 0, the root container will be signalled. 566 PID int32 567 568 // Mode is the signal delivery mode. 569 Mode SignalDeliveryMode 570 } 571 572 // Signal sends a signal to one or more processes in a container. If args.PID 573 // is 0, then the container init process is used. Depending on the 574 // args.SignalDeliveryMode option, the signal may be sent directly to the 575 // indicated process, to all processes in the container, or to the foreground 576 // process group. 577 func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { 578 log.Debugf("containerManager.Signal: cid: %s, PID: %d, signal: %d, mode: %v", args.CID, args.PID, args.Signo, args.Mode) 579 return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) 580 }