github.com/criyle/go-sandbox@v0.10.3/container/container_init_linux.go (about) 1 package container 2 3 import ( 4 "errors" 5 "fmt" 6 "os" 7 "os/exec" 8 "os/signal" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "sync" 13 "syscall" 14 15 "github.com/criyle/go-sandbox/pkg/unixsocket" 16 ) 17 18 type containerServer struct { 19 socket *socket 20 containerConfig 21 defaultEnv []string 22 23 done chan struct{} 24 err error 25 doneOnce sync.Once 26 27 recvCh chan recvCmd 28 sendCh chan sendReply 29 30 waitPid chan int 31 waitPidResult chan waitPidResult 32 33 waitAll chan struct{} 34 waitAllDone chan struct{} 35 } 36 37 type recvCmd struct { 38 Cmd cmd 39 Msg unixsocket.Msg 40 } 41 42 type sendReply struct { 43 Reply reply 44 Msg unixsocket.Msg 45 FileToClose []*os.File 46 } 47 48 type waitPidResult struct { 49 WaitStatus syscall.WaitStatus 50 Rusage syscall.Rusage 51 Err error 52 } 53 54 // Init is called for container init process 55 // it will check if pid == 1, otherwise it is noop 56 // Init will do infinite loop on socket commands, 57 // and exits when at socket close, use it in init function 58 func Init() (err error) { 59 // noop if self is not container init process 60 // Notice: docker init is also 1, additional check for args[1] == init 61 if os.Getpid() != 1 || len(os.Args) < 2 || os.Args[1] != initArg { 62 return nil 63 } 64 65 // exit process (with whole container) upon exit this function 66 // possible reason: 67 // 1. socket broken (parent exit) 68 // 2. panic 69 // 3. undefined cmd (possible race condition) 70 defer func() { 71 if err := recover(); err != nil { 72 fmt.Fprintf(os.Stderr, "container_exit: panic: %v\n", err) 73 os.Exit(1) 74 } 75 if err != nil { 76 fmt.Fprintf(os.Stderr, "container_exit: %v\n", err) 77 os.Exit(1) 78 } 79 fmt.Fprintf(os.Stderr, "container_exit\n") 80 os.Exit(0) 81 }() 82 83 // ignore any signal that kills the init process 84 ignoreSignals() 85 86 // limit container resource usage 87 runtime.GOMAXPROCS(containerMaxProc) 88 89 // ensure there's no fd leak to child process (e.g. VSCode leaks ptmx fd) 90 if err := closeOnExecAllFds(); err != nil { 91 return fmt.Errorf("container_init: failed to close_on_exec all fd %v", err) 92 } 93 94 // new_container environment shared the socket at fd 3 (marked close_exec) 95 const defaultFd = 3 96 soc, err := unixsocket.NewSocket(defaultFd) 97 if err != nil { 98 return fmt.Errorf("container_init: failed to new socket %v", err) 99 } 100 101 // serve forever 102 cs := &containerServer{ 103 socket: newSocket(soc), 104 done: make(chan struct{}), 105 sendCh: make(chan sendReply, 1), 106 recvCh: make(chan recvCmd, 1), 107 waitPid: make(chan int), 108 waitAll: make(chan struct{}), 109 waitPidResult: make(chan waitPidResult, 1), 110 waitAllDone: make(chan struct{}, 1), 111 } 112 go cs.sendLoop() 113 go cs.recvLoop() 114 go cs.waitLoop() 115 116 return cs.serve() 117 } 118 119 func (c *containerServer) sendLoop() { 120 for { 121 select { 122 case <-c.done: 123 return 124 125 case rep, ok := <-c.sendCh: 126 if !ok { 127 return 128 } 129 err := c.socket.SendMsg(rep.Reply, rep.Msg) 130 for _, f := range rep.FileToClose { 131 f.Close() 132 } 133 if err != nil { 134 c.socketError(err) 135 return 136 } 137 } 138 } 139 } 140 141 func (c *containerServer) recvLoop() { 142 for { 143 var cmd cmd 144 msg, err := c.socket.RecvMsg(&cmd) 145 if err != nil { 146 c.socketError(err) 147 return 148 } 149 c.recvCh <- recvCmd{ 150 Cmd: cmd, 151 Msg: msg, 152 } 153 } 154 } 155 156 func (c *containerServer) socketError(err error) { 157 c.doneOnce.Do(func() { 158 c.err = err 159 close(c.done) 160 }) 161 } 162 163 func (c *containerServer) waitLoop() { 164 for { 165 select { 166 case pid := <-c.waitPid: 167 var waitStatus syscall.WaitStatus 168 var rusage syscall.Rusage 169 170 _, err := syscall.Wait4(pid, &waitStatus, 0, &rusage) 171 for err == syscall.EINTR { 172 _, err = syscall.Wait4(pid, &waitStatus, 0, &rusage) 173 } 174 if err != nil { 175 c.waitPidResult <- waitPidResult{ 176 Err: err, 177 } 178 continue 179 } 180 c.waitPidResult <- waitPidResult{ 181 WaitStatus: waitStatus, 182 Rusage: rusage, 183 } 184 185 case <-c.waitAll: 186 for { 187 if _, err := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); err != nil && err != syscall.EINTR { 188 break 189 } 190 } 191 c.waitAllDone <- struct{}{} 192 } 193 } 194 } 195 196 func (c *containerServer) serve() error { 197 for { 198 cmd, msg, err := c.recvCmd() 199 if err != nil { 200 return fmt.Errorf("serve: recvCmd %v", err) 201 } 202 if err := c.handleCmd(cmd, msg); err != nil { 203 return fmt.Errorf("serve: failed to execute cmd %v", err) 204 } 205 } 206 } 207 208 func (c *containerServer) handleCmd(cmd cmd, msg unixsocket.Msg) error { 209 switch cmd.Cmd { 210 case cmdPing: 211 return c.handlePing() 212 213 case cmdConf: 214 return c.handleConf(cmd.ConfCmd) 215 216 case cmdOpen: 217 return c.handleOpen(cmd.OpenCmd) 218 219 case cmdDelete: 220 return c.handleDelete(cmd.DeleteCmd) 221 222 case cmdReset: 223 return c.handleReset() 224 225 case cmdExecve: 226 return c.handleExecve(cmd.ExecCmd, msg) 227 } 228 return fmt.Errorf("unknown command: %v", cmd.Cmd) 229 } 230 231 func initContainer(c containerConfig) error { 232 if err := initFileSystem(c); err != nil { 233 return err 234 } 235 if err := syscall.Setdomainname([]byte(c.DomainName)); err != nil { 236 return err 237 } 238 if err := syscall.Sethostname([]byte(c.HostName)); err != nil { 239 return err 240 } 241 if err := os.Chdir(c.WorkDir); err != nil { 242 return err 243 } 244 if len(c.InitCommand) > 0 { 245 cm := exec.Command(c.InitCommand[0], c.InitCommand[1:]...) 246 if output, err := cm.CombinedOutput(); err != nil { 247 os.Stderr.Write(output) 248 return err 249 } 250 } 251 return nil 252 } 253 254 func initFileSystem(c containerConfig) error { 255 // mount tmpfs as root 256 const tmpfs = "tmpfs" 257 if err := syscall.Mount(tmpfs, c.ContainerRoot, tmpfs, 0, ""); err != nil { 258 return fmt.Errorf("init_fs: mount / %v", err) 259 } 260 // change dir to container root 261 if err := syscall.Chdir(c.ContainerRoot); err != nil { 262 return fmt.Errorf("init_fs: chdir %v", err) 263 } 264 // performing mounts 265 for _, m := range c.Mounts { 266 if err := m.Mount(); err != nil { 267 return fmt.Errorf("init_fs: mount %v %v", m, err) 268 } 269 } 270 // pivot root 271 const oldRoot = "old_root" 272 if err := os.Mkdir(oldRoot, 0755); err != nil { 273 return fmt.Errorf("init_fs: mkdir(old_root) %v", err) 274 } 275 if err := syscall.PivotRoot(c.ContainerRoot, oldRoot); err != nil { 276 return fmt.Errorf("init_fs: pivot_root(%s, %s) %v", c.ContainerRoot, oldRoot, err) 277 } 278 if err := syscall.Unmount(oldRoot, syscall.MNT_DETACH); err != nil { 279 return fmt.Errorf("init_fs: unmount(old_root) %v", err) 280 } 281 if err := os.Remove(oldRoot); err != nil { 282 return fmt.Errorf("init_fs: unlink(old_root) %v", err) 283 } 284 // create symlinks 285 for _, l := range c.SymbolicLinks { 286 // ensure dir exists 287 dir := filepath.Dir(l.LinkPath) 288 if err := os.MkdirAll(dir, 0755); err != nil { 289 return fmt.Errorf("init_fs: mkdir_all(%s) %v", dir, err) 290 } 291 if err := os.Symlink(l.Target, l.LinkPath); err != nil { 292 return fmt.Errorf("init_fs: symlink %v", err) 293 } 294 } 295 // mask paths 296 for _, p := range c.MaskPaths { 297 if err := maskPath(p); err != nil { 298 return fmt.Errorf("init_fs: mask path %v", err) 299 } 300 } 301 // readonly root 302 const remountFlag = syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_NOATIME | syscall.MS_NOSUID 303 if err := syscall.Mount(tmpfs, "/", tmpfs, remountFlag, ""); err != nil { 304 return fmt.Errorf("init_fs: readonly remount / %v", err) 305 } 306 return nil 307 } 308 309 func (c *containerServer) recvCmd() (cmd, unixsocket.Msg, error) { 310 select { 311 case <-c.done: 312 return cmd{}, unixsocket.Msg{}, c.err 313 314 case recv := <-c.recvCh: 315 return recv.Cmd, recv.Msg, nil 316 } 317 } 318 319 func (c *containerServer) sendReplyFiles(rep reply, msg unixsocket.Msg, fileToClose []*os.File) error { 320 select { 321 case <-c.done: 322 return c.err 323 324 case c.sendCh <- sendReply{Reply: rep, Msg: msg, FileToClose: fileToClose}: 325 return nil 326 } 327 } 328 329 func (c *containerServer) sendReply(rep reply, msg unixsocket.Msg) error { 330 return c.sendReplyFiles(rep, msg, nil) 331 } 332 333 // sendErrorReply sends error reply 334 func (c *containerServer) sendErrorReply(ft string, v ...interface{}) error { 335 errRep := &errorReply{ 336 Msg: fmt.Sprintf(ft, v...), 337 } 338 // store errno 339 if len(v) == 1 { 340 if errno, ok := v[0].(syscall.Errno); ok { 341 errRep.Errno = &errno 342 } 343 } 344 return c.sendReply(reply{Error: errRep}, unixsocket.Msg{}) 345 } 346 347 func closeOnExecAllFds() error { 348 // get all fd from /proc/self/fd 349 const fdPath = "/proc/self/fd" 350 fds, err := os.ReadDir(fdPath) 351 if err != nil { 352 return err 353 } 354 for _, f := range fds { 355 fd, err := strconv.Atoi(f.Name()) 356 if err != nil { 357 return err 358 } 359 syscall.CloseOnExec(fd) 360 } 361 return nil 362 } 363 364 func maskPath(path string) error { 365 // bind mount /dev/null if it is file 366 if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) { 367 if errors.Is(err, syscall.ENOTDIR) { 368 // otherwise, mount tmpfs to mask it 369 return syscall.Mount("tmpfs", path, "tmpfs", syscall.MS_RDONLY, "") 370 } 371 return err 372 } 373 return nil 374 } 375 376 func ignoreSignals() { 377 signal.Ignore(signalToIgnore...) 378 }