github.com/criyle/go-sandbox@v0.10.3/container/container_init_linux.go (about)

     1  package container
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  	"os/signal"
     9  	"path/filepath"
    10  	"runtime"
    11  	"strconv"
    12  	"sync"
    13  	"syscall"
    14  
    15  	"github.com/criyle/go-sandbox/pkg/unixsocket"
    16  )
    17  
// containerServer holds the state of the container init process: the control
// socket, the embedded container configuration and the channels that connect
// the serve loop with the sendLoop, recvLoop and waitLoop goroutines.
type containerServer struct {
	// socket is the control socket used by sendLoop and recvLoop
	socket *socket
	containerConfig
	// defaultEnv is not referenced in this part of the file
	// NOTE(review): presumably the default environment for executed commands; confirm
	defaultEnv []string

	// done is closed by socketError on the first socket failure,
	// err holds that failure and doneOnce makes sure it happens only once
	done     chan struct{}
	err      error
	doneOnce sync.Once

	// recvCh carries commands from recvLoop to recvCmd,
	// sendCh carries replies from sendReplyFiles to sendLoop
	recvCh chan recvCmd
	sendCh chan sendReply

	// waitPid delivers a pid to waitLoop, which answers on waitPidResult
	waitPid       chan int
	waitPidResult chan waitPidResult

	// waitAll asks waitLoop to wait for all children,
	// waitAllDone is signalled by waitLoop once that has finished
	waitAll     chan struct{}
	waitAllDone chan struct{}
}
    36  
// recvCmd pairs a decoded command with the socket message it arrived with.
// recvLoop puts one value on recvCh for every message read from the socket.
type recvCmd struct {
	Cmd cmd
	Msg unixsocket.Msg
}
    41  
// sendReply is a reply queued for sendLoop. sendLoop sends Reply together
// with Msg over the socket and afterwards closes every file in FileToClose,
// whether or not the send succeeded.
type sendReply struct {
	Reply       reply
	Msg         unixsocket.Msg
	FileToClose []*os.File
}
    47  
// waitPidResult is the outcome of a Wait4 call made by waitLoop for a single
// pid: WaitStatus and Rusage are filled on success, only Err on failure.
type waitPidResult struct {
	WaitStatus syscall.WaitStatus
	Rusage     syscall.Rusage
	Err        error
}
    53  
    54  // Init is called for container init process
    55  // it will check if pid == 1, otherwise it is noop
    56  // Init will do infinite loop on socket commands,
    57  // and exits when at socket close, use it in init function
    58  func Init() (err error) {
    59  	// noop if self is not container init process
    60  	// Notice: docker init is also 1, additional check for args[1] == init
    61  	if os.Getpid() != 1 || len(os.Args) < 2 || os.Args[1] != initArg {
    62  		return nil
    63  	}
    64  
    65  	// exit process (with whole container) upon exit this function
    66  	// possible reason:
    67  	// 1. socket broken (parent exit)
    68  	// 2. panic
    69  	// 3. undefined cmd (possible race condition)
    70  	defer func() {
    71  		if err := recover(); err != nil {
    72  			fmt.Fprintf(os.Stderr, "container_exit: panic: %v\n", err)
    73  			os.Exit(1)
    74  		}
    75  		if err != nil {
    76  			fmt.Fprintf(os.Stderr, "container_exit: %v\n", err)
    77  			os.Exit(1)
    78  		}
    79  		fmt.Fprintf(os.Stderr, "container_exit\n")
    80  		os.Exit(0)
    81  	}()
    82  
    83  	// ignore any signal that kills the init process
    84  	ignoreSignals()
    85  
    86  	// limit container resource usage
    87  	runtime.GOMAXPROCS(containerMaxProc)
    88  
    89  	// ensure there's no fd leak to child process (e.g. VSCode leaks ptmx fd)
    90  	if err := closeOnExecAllFds(); err != nil {
    91  		return fmt.Errorf("container_init: failed to close_on_exec all fd %v", err)
    92  	}
    93  
    94  	// new_container environment shared the socket at fd 3 (marked close_exec)
    95  	const defaultFd = 3
    96  	soc, err := unixsocket.NewSocket(defaultFd)
    97  	if err != nil {
    98  		return fmt.Errorf("container_init: failed to new socket %v", err)
    99  	}
   100  
   101  	// serve forever
   102  	cs := &containerServer{
   103  		socket:        newSocket(soc),
   104  		done:          make(chan struct{}),
   105  		sendCh:        make(chan sendReply, 1),
   106  		recvCh:        make(chan recvCmd, 1),
   107  		waitPid:       make(chan int),
   108  		waitAll:       make(chan struct{}),
   109  		waitPidResult: make(chan waitPidResult, 1),
   110  		waitAllDone:   make(chan struct{}, 1),
   111  	}
   112  	go cs.sendLoop()
   113  	go cs.recvLoop()
   114  	go cs.waitLoop()
   115  
   116  	return cs.serve()
   117  }
   118  
   119  func (c *containerServer) sendLoop() {
   120  	for {
   121  		select {
   122  		case <-c.done:
   123  			return
   124  
   125  		case rep, ok := <-c.sendCh:
   126  			if !ok {
   127  				return
   128  			}
   129  			err := c.socket.SendMsg(rep.Reply, rep.Msg)
   130  			for _, f := range rep.FileToClose {
   131  				f.Close()
   132  			}
   133  			if err != nil {
   134  				c.socketError(err)
   135  				return
   136  			}
   137  		}
   138  	}
   139  }
   140  
   141  func (c *containerServer) recvLoop() {
   142  	for {
   143  		var cmd cmd
   144  		msg, err := c.socket.RecvMsg(&cmd)
   145  		if err != nil {
   146  			c.socketError(err)
   147  			return
   148  		}
   149  		c.recvCh <- recvCmd{
   150  			Cmd: cmd,
   151  			Msg: msg,
   152  		}
   153  	}
   154  }
   155  
   156  func (c *containerServer) socketError(err error) {
   157  	c.doneOnce.Do(func() {
   158  		c.err = err
   159  		close(c.done)
   160  	})
   161  }
   162  
   163  func (c *containerServer) waitLoop() {
   164  	for {
   165  		select {
   166  		case pid := <-c.waitPid:
   167  			var waitStatus syscall.WaitStatus
   168  			var rusage syscall.Rusage
   169  
   170  			_, err := syscall.Wait4(pid, &waitStatus, 0, &rusage)
   171  			for err == syscall.EINTR {
   172  				_, err = syscall.Wait4(pid, &waitStatus, 0, &rusage)
   173  			}
   174  			if err != nil {
   175  				c.waitPidResult <- waitPidResult{
   176  					Err: err,
   177  				}
   178  				continue
   179  			}
   180  			c.waitPidResult <- waitPidResult{
   181  				WaitStatus: waitStatus,
   182  				Rusage:     rusage,
   183  			}
   184  
   185  		case <-c.waitAll:
   186  			for {
   187  				if _, err := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); err != nil && err != syscall.EINTR {
   188  					break
   189  				}
   190  			}
   191  			c.waitAllDone <- struct{}{}
   192  		}
   193  	}
   194  }
   195  
   196  func (c *containerServer) serve() error {
   197  	for {
   198  		cmd, msg, err := c.recvCmd()
   199  		if err != nil {
   200  			return fmt.Errorf("serve: recvCmd %v", err)
   201  		}
   202  		if err := c.handleCmd(cmd, msg); err != nil {
   203  			return fmt.Errorf("serve: failed to execute cmd %v", err)
   204  		}
   205  	}
   206  }
   207  
   208  func (c *containerServer) handleCmd(cmd cmd, msg unixsocket.Msg) error {
   209  	switch cmd.Cmd {
   210  	case cmdPing:
   211  		return c.handlePing()
   212  
   213  	case cmdConf:
   214  		return c.handleConf(cmd.ConfCmd)
   215  
   216  	case cmdOpen:
   217  		return c.handleOpen(cmd.OpenCmd)
   218  
   219  	case cmdDelete:
   220  		return c.handleDelete(cmd.DeleteCmd)
   221  
   222  	case cmdReset:
   223  		return c.handleReset()
   224  
   225  	case cmdExecve:
   226  		return c.handleExecve(cmd.ExecCmd, msg)
   227  	}
   228  	return fmt.Errorf("unknown command: %v", cmd.Cmd)
   229  }
   230  
   231  func initContainer(c containerConfig) error {
   232  	if err := initFileSystem(c); err != nil {
   233  		return err
   234  	}
   235  	if err := syscall.Setdomainname([]byte(c.DomainName)); err != nil {
   236  		return err
   237  	}
   238  	if err := syscall.Sethostname([]byte(c.HostName)); err != nil {
   239  		return err
   240  	}
   241  	if err := os.Chdir(c.WorkDir); err != nil {
   242  		return err
   243  	}
   244  	if len(c.InitCommand) > 0 {
   245  		cm := exec.Command(c.InitCommand[0], c.InitCommand[1:]...)
   246  		if output, err := cm.CombinedOutput(); err != nil {
   247  			os.Stderr.Write(output)
   248  			return err
   249  		}
   250  	}
   251  	return nil
   252  }
   253  
   254  func initFileSystem(c containerConfig) error {
   255  	// mount tmpfs as root
   256  	const tmpfs = "tmpfs"
   257  	if err := syscall.Mount(tmpfs, c.ContainerRoot, tmpfs, 0, ""); err != nil {
   258  		return fmt.Errorf("init_fs: mount / %v", err)
   259  	}
   260  	// change dir to container root
   261  	if err := syscall.Chdir(c.ContainerRoot); err != nil {
   262  		return fmt.Errorf("init_fs: chdir %v", err)
   263  	}
   264  	// performing mounts
   265  	for _, m := range c.Mounts {
   266  		if err := m.Mount(); err != nil {
   267  			return fmt.Errorf("init_fs: mount %v %v", m, err)
   268  		}
   269  	}
   270  	// pivot root
   271  	const oldRoot = "old_root"
   272  	if err := os.Mkdir(oldRoot, 0755); err != nil {
   273  		return fmt.Errorf("init_fs: mkdir(old_root) %v", err)
   274  	}
   275  	if err := syscall.PivotRoot(c.ContainerRoot, oldRoot); err != nil {
   276  		return fmt.Errorf("init_fs: pivot_root(%s, %s) %v", c.ContainerRoot, oldRoot, err)
   277  	}
   278  	if err := syscall.Unmount(oldRoot, syscall.MNT_DETACH); err != nil {
   279  		return fmt.Errorf("init_fs: unmount(old_root) %v", err)
   280  	}
   281  	if err := os.Remove(oldRoot); err != nil {
   282  		return fmt.Errorf("init_fs: unlink(old_root) %v", err)
   283  	}
   284  	// create symlinks
   285  	for _, l := range c.SymbolicLinks {
   286  		// ensure dir exists
   287  		dir := filepath.Dir(l.LinkPath)
   288  		if err := os.MkdirAll(dir, 0755); err != nil {
   289  			return fmt.Errorf("init_fs: mkdir_all(%s) %v", dir, err)
   290  		}
   291  		if err := os.Symlink(l.Target, l.LinkPath); err != nil {
   292  			return fmt.Errorf("init_fs: symlink %v", err)
   293  		}
   294  	}
   295  	// mask paths
   296  	for _, p := range c.MaskPaths {
   297  		if err := maskPath(p); err != nil {
   298  			return fmt.Errorf("init_fs: mask path %v", err)
   299  		}
   300  	}
   301  	// readonly root
   302  	const remountFlag = syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_NOATIME | syscall.MS_NOSUID
   303  	if err := syscall.Mount(tmpfs, "/", tmpfs, remountFlag, ""); err != nil {
   304  		return fmt.Errorf("init_fs: readonly remount / %v", err)
   305  	}
   306  	return nil
   307  }
   308  
   309  func (c *containerServer) recvCmd() (cmd, unixsocket.Msg, error) {
   310  	select {
   311  	case <-c.done:
   312  		return cmd{}, unixsocket.Msg{}, c.err
   313  
   314  	case recv := <-c.recvCh:
   315  		return recv.Cmd, recv.Msg, nil
   316  	}
   317  }
   318  
   319  func (c *containerServer) sendReplyFiles(rep reply, msg unixsocket.Msg, fileToClose []*os.File) error {
   320  	select {
   321  	case <-c.done:
   322  		return c.err
   323  
   324  	case c.sendCh <- sendReply{Reply: rep, Msg: msg, FileToClose: fileToClose}:
   325  		return nil
   326  	}
   327  }
   328  
// sendReply queues a reply that has no files to close after sending;
// it is sendReplyFiles with a nil file list.
func (c *containerServer) sendReply(rep reply, msg unixsocket.Msg) error {
	return c.sendReplyFiles(rep, msg, nil)
}
   332  
   333  // sendErrorReply sends error reply
   334  func (c *containerServer) sendErrorReply(ft string, v ...interface{}) error {
   335  	errRep := &errorReply{
   336  		Msg: fmt.Sprintf(ft, v...),
   337  	}
   338  	// store errno
   339  	if len(v) == 1 {
   340  		if errno, ok := v[0].(syscall.Errno); ok {
   341  			errRep.Errno = &errno
   342  		}
   343  	}
   344  	return c.sendReply(reply{Error: errRep}, unixsocket.Msg{})
   345  }
   346  
   347  func closeOnExecAllFds() error {
   348  	// get all fd from /proc/self/fd
   349  	const fdPath = "/proc/self/fd"
   350  	fds, err := os.ReadDir(fdPath)
   351  	if err != nil {
   352  		return err
   353  	}
   354  	for _, f := range fds {
   355  		fd, err := strconv.Atoi(f.Name())
   356  		if err != nil {
   357  			return err
   358  		}
   359  		syscall.CloseOnExec(fd)
   360  	}
   361  	return nil
   362  }
   363  
   364  func maskPath(path string) error {
   365  	// bind mount /dev/null if it is file
   366  	if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) {
   367  		if errors.Is(err, syscall.ENOTDIR) {
   368  			// otherwise, mount tmpfs to mask it
   369  			return syscall.Mount("tmpfs", path, "tmpfs", syscall.MS_RDONLY, "")
   370  		}
   371  		return err
   372  	}
   373  	return nil
   374  }
   375  
// ignoreSignals makes the process ignore the signals listed in
// signalToIgnore (declared outside this part of the file) by handing them
// to signal.Ignore.
func ignoreSignals() {
	signal.Ignore(signalToIgnore...)
}