github.com/criyle/go-sandbox@v0.10.3/container/environment_linux.go (about)

     1  package container
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"os/exec"
     9  	"sync"
    10  	"syscall"
    11  
    12  	"github.com/criyle/go-sandbox/pkg/forkexec"
    13  	"github.com/criyle/go-sandbox/pkg/mount"
    14  	"github.com/criyle/go-sandbox/pkg/unixsocket"
    15  	"github.com/criyle/go-sandbox/runner"
    16  	"golang.org/x/sys/unix"
    17  )
    18  
// PathEnv is the PATH environment variable handed to the container init
// process. It is the only environment entry set when the init process is
// started.
const PathEnv = "PATH=/usr/local/bin:/usr/bin:/bin"
    21  
// Builder builds instances of the container environment. Zero-valued fields
// fall back to the defaults described on each field.
type Builder struct {
	// Root is the container root mount path on the host; when empty the
	// current working directory is used. When TmpRoot is set, Root is instead
	// the parent directory in which the temporary root is created.
	Root string

	// TmpRoot defines the temp dir name pattern if not empty. A temporary
	// directory is then created (via os.MkdirTemp under Root) and used as the
	// container root dir.
	TmpRoot string

	// Mounts defines container mount points, empty uses default mounts
	Mounts []mount.Mount

	// SymbolicLinks defines symlinks to be created after mounting the file
	// systems, empty uses the default symlinks
	SymbolicLinks []SymbolicLink

	// MaskPaths defines paths to be masked to avoid reading information from
	// outside of the container, empty uses the default mask paths
	MaskPaths []string

	// WorkDir defines container default work directory (default: /w)
	WorkDir string

	// Stderr, if set, is used as the stderr of the container init process
	// (intended for debugging)
	Stderr io.Writer

	// ExecFile defines the executable that calls Init; when empty it defaults
	// to the current executable (/proc/self/exe)
	ExecFile string

	// CredGenerator defines a credential generator used to create new container.
	// When nil, only the effective uid / gid of the host process is mapped.
	CredGenerator CredGenerator

	// CloneFlags defines the unshare clone flags used to create the container.
	// Zero selects forkexec.UnshareFlags; any other value is masked with
	// forkexec.UnshareFlags.
	CloneFlags uintptr

	// HostName set container hostname (default: go-sandbox)
	HostName string

	// DomainName set container domainname (default: go-sandbox)
	DomainName string

	// InitCommand defines command that runs after the initialization of the container
	// to do additional setups (for example, loopback network)
	InitCommand []string

	// ContainerUID & ContainerGID set the container uid / gid mapping.
	// Zero selects the package default when building the id mapping.
	ContainerUID int
	ContainerGID int
}
    70  
// SymbolicLink defines a symlink to be created after the container file
// systems are mounted.
type SymbolicLink struct {
	// LinkPath is the path of the symlink itself.
	LinkPath string
	// Target is the path the symlink points to.
	Target string
}
    76  
// CredGenerator generates the uid / gid credential used by a container
// to isolate process and file system access.
type CredGenerator interface {
	// Get returns the credential to use for a new container; it is called
	// once each time a container is started.
	Get() syscall.Credential
}
    82  
// Environment holds a containerized environment for a single program.
type Environment interface {
	// Ping checks that the container init process is responding.
	Ping() error
	Open([]OpenCmd) ([]*os.File, error)
	Delete(p string) error
	Reset() error
	Execve(context.Context, ExecveParam) runner.Result
	// Destroy tears the container down and releases its resources.
	Destroy() error
}
    92  
// container manages a single pre-forked container environment.
type container struct {
	process *os.Process // underlying container init process
	socket  *socket     // host - container communication
	mu      sync.Mutex  // lock to avoid race condition

	// done is closed, exactly once (guarded by doneOnce), when a socket
	// error occurs; err holds the error that caused it.
	done     chan struct{}
	err      error
	doneOnce sync.Once

	// recvCh carries replies read from the socket by recvLoop;
	// sendCh carries commands to be written to the socket by sendLoop.
	recvCh chan recvReply
	sendCh chan sendCmd
}
   106  
// recvReply pairs a reply decoded from the container socket with the
// socket message that was received alongside it.
type recvReply struct {
	Reply reply
	Msg   unixsocket.Msg
}
   111  
// sendCmd pairs a command for the container with the socket message that
// is sent alongside it.
type sendCmd struct {
	Cmd cmd
	Msg unixsocket.Msg
}
   116  
   117  // Build creates new environment with underlying container
   118  func (b *Builder) Build() (Environment, error) {
   119  	c, err := b.startContainer()
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  
   124  	// avoid non cinit enabled executable running as container init process
   125  	if err = c.Ping(); err != nil {
   126  		c.Destroy()
   127  		return nil, fmt.Errorf("container: container init not responding to ping %v", err)
   128  	}
   129  
   130  	// container mount points
   131  	mounts := b.Mounts
   132  	if len(mounts) == 0 {
   133  		mounts = mount.NewDefaultBuilder().
   134  			WithTmpfs("w", "").   // work dir
   135  			WithTmpfs("tmp", ""). // tmp
   136  			FilterNotExist().Mounts
   137  	}
   138  
   139  	// container symbolic links
   140  	links := b.SymbolicLinks
   141  	if len(links) == 0 {
   142  		links = defaultSymLinks
   143  	}
   144  
   145  	maskPaths := b.MaskPaths
   146  	if len(maskPaths) == 0 {
   147  		maskPaths = defaultMaskPaths
   148  	}
   149  
   150  	// container root directory on the host
   151  	root := b.Root
   152  	if b.TmpRoot != "" {
   153  		if root, err = os.MkdirTemp(b.Root, b.TmpRoot); err != nil {
   154  			return nil, fmt.Errorf("container: failed to make tmp container root at %s %v", b.Root, err)
   155  		}
   156  		defer os.Remove(root)
   157  	}
   158  	if root == "" {
   159  		if root, err = os.Getwd(); err != nil {
   160  			return nil, fmt.Errorf("container: failed to get work directory %v", err)
   161  		}
   162  	}
   163  	workDir := containerWD
   164  	if b.WorkDir != "" {
   165  		workDir = b.WorkDir
   166  	}
   167  	hostName := containerName
   168  	if b.HostName != "" {
   169  		hostName = b.HostName
   170  	}
   171  	domainName := containerName
   172  	if b.DomainName != "" {
   173  		domainName = b.DomainName
   174  	}
   175  
   176  	// set configuration and check if container creation successful
   177  	if err = c.conf(&containerConfig{
   178  		WorkDir:       workDir,
   179  		HostName:      hostName,
   180  		DomainName:    domainName,
   181  		ContainerRoot: root,
   182  		Mounts:        mounts,
   183  		SymbolicLinks: links,
   184  		MaskPaths:     maskPaths,
   185  		InitCommand:   b.InitCommand,
   186  		Cred:          b.CredGenerator != nil,
   187  		ContainerUID:  b.ContainerUID,
   188  		ContainerGID:  b.ContainerGID,
   189  		UnshareCgroup: b.CloneFlags&unix.CLONE_NEWCGROUP == unix.CLONE_NEWCGROUP,
   190  	}); err != nil {
   191  		c.Destroy()
   192  		return nil, err
   193  	}
   194  	return c, nil
   195  }
   196  
   197  func (b *Builder) startContainer() (*container, error) {
   198  	var (
   199  		err            error
   200  		cred           syscall.Credential
   201  		uidMap, gidMap []syscall.SysProcIDMap
   202  	)
   203  	// prepare host <-> container unix socket
   204  	ins, outs, err := newPassCredSocketPair()
   205  	if err != nil {
   206  		return nil, fmt.Errorf("container: failed to create socket: %v", err)
   207  	}
   208  	defer outs.Close()
   209  
   210  	outf, err := outs.File()
   211  	if err != nil {
   212  		ins.Close()
   213  		return nil, fmt.Errorf("container: failed to dup container socket fd %v", err)
   214  	}
   215  	defer outf.Close()
   216  
   217  	// prepare container running credential
   218  	if b.CredGenerator != nil {
   219  		cred = b.CredGenerator.Get()
   220  		uidMap, gidMap = b.getIDMapping(&cred)
   221  	} else {
   222  		uidMap = []syscall.SysProcIDMap{{HostID: os.Geteuid(), Size: 1}}
   223  		gidMap = []syscall.SysProcIDMap{{HostID: os.Getegid(), Size: 1}}
   224  	}
   225  
   226  	var cloneFlag uintptr
   227  	if b.CloneFlags == 0 {
   228  		cloneFlag = forkexec.UnshareFlags
   229  	} else {
   230  		cloneFlag = b.CloneFlags & forkexec.UnshareFlags
   231  	}
   232  
   233  	exe := "/proc/self/exe"
   234  	if b.ExecFile != "" {
   235  		exe = b.ExecFile
   236  	}
   237  	args := []string{exe, initArg}
   238  
   239  	r := exec.Cmd{
   240  		Path:       exe,
   241  		Args:       args,
   242  		Env:        []string{PathEnv},
   243  		Stderr:     b.Stderr,
   244  		ExtraFiles: []*os.File{outf},
   245  		SysProcAttr: &syscall.SysProcAttr{
   246  			Cloneflags:  cloneFlag,
   247  			UidMappings: uidMap,
   248  			GidMappings: gidMap,
   249  			AmbientCaps: []uintptr{
   250  				unix.CAP_SYS_ADMIN,
   251  				unix.CAP_SYS_RESOURCE,
   252  			},
   253  			Pdeathsig: syscall.SIGTERM,
   254  		},
   255  	}
   256  	if err = r.Start(); err != nil {
   257  		ins.Close()
   258  		return nil, fmt.Errorf("container: failed to start container %v", err)
   259  	}
   260  	c := &container{
   261  		process: r.Process,
   262  		socket:  newSocket(ins),
   263  		recvCh:  make(chan recvReply, 1),
   264  		sendCh:  make(chan sendCmd, 1),
   265  		done:    make(chan struct{}),
   266  	}
   267  	go c.sendLoop()
   268  	go c.recvLoop()
   269  
   270  	return c, nil
   271  }
   272  
   273  func (c *container) sendLoop() {
   274  	for {
   275  		select {
   276  		case <-c.done:
   277  			return
   278  
   279  		case cmd, ok := <-c.sendCh:
   280  			if !ok {
   281  				return
   282  			}
   283  			if err := c.socket.SendMsg(cmd.Cmd, cmd.Msg); err != nil {
   284  				c.socketError(err)
   285  				return
   286  			}
   287  		}
   288  	}
   289  }
   290  
   291  func (c *container) recvLoop() {
   292  	for {
   293  		var reply reply
   294  		msg, err := c.socket.RecvMsg(&reply)
   295  		if err != nil {
   296  			c.socketError(err)
   297  			return
   298  		}
   299  		c.recvCh <- recvReply{
   300  			Reply: reply,
   301  			Msg:   msg,
   302  		}
   303  	}
   304  }
   305  
   306  func (c *container) socketError(err error) {
   307  	c.doneOnce.Do(func() {
   308  		c.err = err
   309  		close(c.done)
   310  	})
   311  }
   312  
   313  // Destroy kill the container process (with its children)
   314  // if stderr enabled, collect the output as error
   315  func (c *container) Destroy() error {
   316  	// close socket (abort any ongoing command)
   317  	c.socket.Close()
   318  
   319  	// wait commands terminates
   320  	c.mu.Lock()
   321  	defer c.mu.Unlock()
   322  
   323  	// kill process
   324  	c.process.Kill()
   325  	_, err := c.process.Wait()
   326  	return err
   327  }
   328  
   329  // newPassCredSocketPair creates socket pair and let the first socket to receive credential information
   330  func newPassCredSocketPair() (*unixsocket.Socket, *unixsocket.Socket, error) {
   331  	ins, outs, err := unixsocket.NewSocketPair()
   332  	if err != nil {
   333  		return nil, nil, err
   334  	}
   335  	if err = ins.SetPassCred(1); err != nil {
   336  		ins.Close()
   337  		outs.Close()
   338  		return nil, nil, err
   339  	}
   340  	return ins, outs, nil
   341  }
   342  
   343  func (b *Builder) getIDMapping(cred *syscall.Credential) ([]syscall.SysProcIDMap, []syscall.SysProcIDMap) {
   344  	cUID := b.ContainerUID
   345  	if cUID == 0 {
   346  		cUID = containerUID
   347  	}
   348  
   349  	cGID := b.ContainerGID
   350  	if cGID == 0 {
   351  		cGID = containerGID
   352  	}
   353  
   354  	uidMap := []syscall.SysProcIDMap{
   355  		{
   356  			ContainerID: 0,
   357  			HostID:      os.Geteuid(),
   358  			Size:        1,
   359  		},
   360  		{
   361  			ContainerID: cUID,
   362  			HostID:      int(cred.Uid),
   363  			Size:        1,
   364  		},
   365  	}
   366  
   367  	gidMap := []syscall.SysProcIDMap{
   368  		{
   369  			ContainerID: 0,
   370  			HostID:      os.Getegid(),
   371  			Size:        1,
   372  		},
   373  		{
   374  			ContainerID: cGID,
   375  			HostID:      int(cred.Gid),
   376  			Size:        1,
   377  		},
   378  	}
   379  
   380  	return uidMap, gidMap
   381  }
   382  
   383  func (c *container) recvAckReply(name string) error {
   384  	reply, _, err := c.recvReply()
   385  	if err != nil {
   386  		return fmt.Errorf("%v: recvAck %v", name, err)
   387  	}
   388  	if reply.Error != nil {
   389  		return fmt.Errorf("%v: container error %v", name, reply.Error)
   390  	}
   391  	return nil
   392  }
   393  func (c *container) recvReply() (reply, unixsocket.Msg, error) {
   394  	select {
   395  	case <-c.done:
   396  		return reply{}, unixsocket.Msg{}, c.err
   397  
   398  	case recv := <-c.recvCh:
   399  		return recv.Reply, recv.Msg, nil
   400  	}
   401  }
   402  
   403  func (c *container) sendCmd(cmd cmd, msg unixsocket.Msg) error {
   404  	select {
   405  	case <-c.done:
   406  		return c.err
   407  
   408  	case c.sendCh <- sendCmd{Cmd: cmd, Msg: msg}:
   409  		return nil
   410  	}
   411  }