github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/proc/tasks_sys.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/ipv4"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    34  )
    35  
// tcpMemDir selects which TCP buffer-size setting a tcpMemData file exposes:
// the receive buffer sizes (tcp_rmem) or the send buffer sizes (tcp_wmem).
//
// +stateify savable
type tcpMemDir int

const (
	// tcpRMem selects the TCP receive buffer sizes.
	tcpRMem tcpMemDir = iota
	// tcpWMem selects the TCP send buffer sizes.
	tcpWMem
)
    43  
    44  // newSysDir returns the dentry corresponding to /proc/sys directory.
    45  func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
    46  	return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    47  		"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    48  			"hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
    49  			"sem":      fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))),
    50  			"shmall":   fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)),
    51  			"shmmax":   fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)),
    52  			"shmmni":   fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)),
    53  			"msgmni":   fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)),
    54  			"msgmax":   fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)),
    55  			"msgmnb":   fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)),
    56  			"yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    57  				"ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root),
    58  			}),
    59  		}),
    60  		"vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    61  			"max_map_count":     fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")),
    62  			"mmap_min_addr":     fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}),
    63  			"overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")),
    64  		}),
    65  		"net": fs.newSysNetDir(ctx, root, k),
    66  	})
    67  }
    68  
    69  // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
    70  func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
    71  	var contents map[string]kernfs.Inode
    72  
    73  	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
    74  	// network namespace of the calling process.
    75  	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
    76  		contents = map[string]kernfs.Inode{
    77  			"ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    78  				"ip_forward":          fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}),
    79  				"ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}),
    80  				"tcp_recovery":        fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}),
    81  				"tcp_rmem":            fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
    82  				"tcp_sack":            fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}),
    83  				"tcp_wmem":            fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
    84  
    85  				// The following files are simple stubs until they are implemented in
    86  				// netstack, most of these files are configuration related. We use the
    87  				// value closest to the actual netstack behavior or any empty file, all
    88  				// of these files will have mode 0444 (read-only for all users).
    89  				"ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")),
    90  				"ipfrag_time":             fs.newInode(ctx, root, 0444, newStaticFile("30")),
    91  				"ip_nonlocal_bind":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
    92  				"ip_no_pmtu_disc":         fs.newInode(ctx, root, 0444, newStaticFile("1")),
    93  
    94  				// tcp_allowed_congestion_control tell the user what they are able to
    95  				// do as an unprivledged process so we leave it empty.
    96  				"tcp_allowed_congestion_control":   fs.newInode(ctx, root, 0444, newStaticFile("")),
    97  				"tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
    98  				"tcp_congestion_control":           fs.newInode(ctx, root, 0444, newStaticFile("reno")),
    99  
   100  				// Many of the following stub files are features netstack doesn't
   101  				// support. The unsupported features return "0" to indicate they are
   102  				// disabled.
   103  				"tcp_base_mss":              fs.newInode(ctx, root, 0444, newStaticFile("1280")),
   104  				"tcp_dsack":                 fs.newInode(ctx, root, 0444, newStaticFile("0")),
   105  				"tcp_early_retrans":         fs.newInode(ctx, root, 0444, newStaticFile("0")),
   106  				"tcp_fack":                  fs.newInode(ctx, root, 0444, newStaticFile("0")),
   107  				"tcp_fastopen":              fs.newInode(ctx, root, 0444, newStaticFile("0")),
   108  				"tcp_fastopen_key":          fs.newInode(ctx, root, 0444, newStaticFile("")),
   109  				"tcp_invalid_ratelimit":     fs.newInode(ctx, root, 0444, newStaticFile("0")),
   110  				"tcp_keepalive_intvl":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
   111  				"tcp_keepalive_probes":      fs.newInode(ctx, root, 0444, newStaticFile("0")),
   112  				"tcp_keepalive_time":        fs.newInode(ctx, root, 0444, newStaticFile("7200")),
   113  				"tcp_mtu_probing":           fs.newInode(ctx, root, 0444, newStaticFile("0")),
   114  				"tcp_no_metrics_save":       fs.newInode(ctx, root, 0444, newStaticFile("1")),
   115  				"tcp_probe_interval":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
   116  				"tcp_probe_threshold":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
   117  				"tcp_retries1":              fs.newInode(ctx, root, 0444, newStaticFile("3")),
   118  				"tcp_retries2":              fs.newInode(ctx, root, 0444, newStaticFile("15")),
   119  				"tcp_rfc1337":               fs.newInode(ctx, root, 0444, newStaticFile("1")),
   120  				"tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")),
   121  				"tcp_synack_retries":        fs.newInode(ctx, root, 0444, newStaticFile("5")),
   122  				"tcp_syn_retries":           fs.newInode(ctx, root, 0444, newStaticFile("3")),
   123  				"tcp_timestamps":            fs.newInode(ctx, root, 0444, newStaticFile("1")),
   124  			}),
   125  			"core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
   126  				"default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")),
   127  				"message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")),
   128  				"message_cost":  fs.newInode(ctx, root, 0444, newStaticFile("5")),
   129  				"optmem_max":    fs.newInode(ctx, root, 0444, newStaticFile("0")),
   130  				"rmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   131  				"rmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   132  				"somaxconn":     fs.newInode(ctx, root, 0444, newStaticFile("128")),
   133  				"wmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   134  				"wmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   135  			}),
   136  		}
   137  	}
   138  
   139  	return fs.newStaticDir(ctx, root, contents)
   140  }
   141  
// mmapMinAddrData implements vfs.DynamicBytesSource for
// /proc/sys/vm/mmap_min_addr.
//
// +stateify savable
type mmapMinAddrData struct {
	kernfs.DynamicBytesFile

	// k is the kernel whose platform supplies the minimum user address that
	// Generate reports.
	k *kernel.Kernel
}

// Compile-time check that *mmapMinAddrData satisfies dynamicInode.
var _ dynamicInode = (*mmapMinAddrData)(nil)
   153  
   154  // Generate implements vfs.DynamicBytesSource.Generate.
   155  func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   156  	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
   157  	return nil
   158  }
   159  
// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname.
// It holds no state of its own; Generate reads the hostname from the UTS
// namespace found in the context it is given.
//
// +stateify savable
type hostnameData struct {
	kernfs.DynamicBytesFile
}

// Compile-time check that *hostnameData satisfies dynamicInode.
var _ dynamicInode = (*hostnameData)(nil)
   168  
   169  // Generate implements vfs.DynamicBytesSource.Generate.
   170  func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   171  	utsns := kernel.UTSNamespaceFromContext(ctx)
   172  	buf.WriteString(utsns.HostName())
   173  	buf.WriteString("\n")
   174  	return nil
   175  }
   176  
// tcpSackData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_sack.
//
// +stateify savable
type tcpSackData struct {
	kernfs.DynamicBytesFile

	// stack is the network stack whose TCP SACK setting this file exposes.
	// enabled caches that setting: it is nil until Generate first queries
	// the stack or Write stores a value.
	stack   inet.Stack `state:"wait"`
	enabled *bool
}

// Compile-time check that *tcpSackData satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
   189  
   190  // Generate implements vfs.DynamicBytesSource.Generate.
   191  func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   192  	if d.enabled == nil {
   193  		sack, err := d.stack.TCPSACKEnabled()
   194  		if err != nil {
   195  			return err
   196  		}
   197  		d.enabled = &sack
   198  	}
   199  
   200  	val := "0\n"
   201  	if *d.enabled {
   202  		// Technically, this is not quite compatible with Linux. Linux stores these
   203  		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
   204  		// Tough luck.
   205  		val = "1\n"
   206  	}
   207  	_, err := buf.WriteString(val)
   208  	return err
   209  }
   210  
   211  // Write implements vfs.WritableDynamicBytesSource.Write.
   212  func (d *tcpSackData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   213  	if offset != 0 {
   214  		// No need to handle partial writes thus far.
   215  		return 0, linuxerr.EINVAL
   216  	}
   217  	if src.NumBytes() == 0 {
   218  		return 0, nil
   219  	}
   220  
   221  	// Limit the amount of memory allocated.
   222  	src = src.TakeFirst(hostarch.PageSize - 1)
   223  
   224  	var v int32
   225  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   226  	if err != nil {
   227  		return 0, err
   228  	}
   229  	if d.enabled == nil {
   230  		d.enabled = new(bool)
   231  	}
   232  	*d.enabled = v != 0
   233  	return n, d.stack.SetTCPSACKEnabled(*d.enabled)
   234  }
   235  
// tcpRecoveryData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_recovery.
//
// +stateify savable
type tcpRecoveryData struct {
	kernfs.DynamicBytesFile

	// stack is the network stack whose TCP loss recovery setting is read by
	// Generate and changed by Write.
	stack inet.Stack `state:"wait"`
}

// Compile-time check that *tcpRecoveryData satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
   247  
   248  // Generate implements vfs.DynamicBytesSource.Generate.
   249  func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   250  	recovery, err := d.stack.TCPRecovery()
   251  	if err != nil {
   252  		return err
   253  	}
   254  
   255  	_, err = buf.WriteString(fmt.Sprintf("%d\n", recovery))
   256  	return err
   257  }
   258  
   259  // Write implements vfs.WritableDynamicBytesSource.Write.
   260  func (d *tcpRecoveryData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   261  	if offset != 0 {
   262  		// No need to handle partial writes thus far.
   263  		return 0, linuxerr.EINVAL
   264  	}
   265  	if src.NumBytes() == 0 {
   266  		return 0, nil
   267  	}
   268  
   269  	// Limit the amount of memory allocated.
   270  	src = src.TakeFirst(hostarch.PageSize - 1)
   271  
   272  	var v int32
   273  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   274  	if err != nil {
   275  		return 0, err
   276  	}
   277  	if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
   278  		return 0, err
   279  	}
   280  	return n, nil
   281  }
   282  
// tcpMemData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem.
//
// +stateify savable
type tcpMemData struct {
	kernfs.DynamicBytesFile

	// dir selects whether this file exposes the receive (tcpRMem) or send
	// (tcpWMem) buffer sizes of stack.
	dir   tcpMemDir
	stack inet.Stack `state:"wait"`

	// mu protects against concurrent reads/writes to FDs based on the dentry
	// backing this byte source.
	mu sync.Mutex `state:"nosave"`
}

// Compile-time check that *tcpMemData satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
   299  
   300  // Generate implements vfs.DynamicBytesSource.Generate.
   301  func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   302  	d.mu.Lock()
   303  	defer d.mu.Unlock()
   304  
   305  	size, err := d.readSizeLocked()
   306  	if err != nil {
   307  		return err
   308  	}
   309  	_, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max))
   310  	return err
   311  }
   312  
   313  // Write implements vfs.WritableDynamicBytesSource.Write.
   314  func (d *tcpMemData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   315  	if offset != 0 {
   316  		// No need to handle partial writes thus far.
   317  		return 0, linuxerr.EINVAL
   318  	}
   319  	if src.NumBytes() == 0 {
   320  		return 0, nil
   321  	}
   322  	d.mu.Lock()
   323  	defer d.mu.Unlock()
   324  
   325  	// Limit the amount of memory allocated.
   326  	src = src.TakeFirst(hostarch.PageSize - 1)
   327  	size, err := d.readSizeLocked()
   328  	if err != nil {
   329  		return 0, err
   330  	}
   331  	buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)}
   332  	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts)
   333  	if err != nil {
   334  		return 0, err
   335  	}
   336  	newSize := inet.TCPBufferSize{
   337  		Min:     int(buf[0]),
   338  		Default: int(buf[1]),
   339  		Max:     int(buf[2]),
   340  	}
   341  	if err := d.writeSizeLocked(newSize); err != nil {
   342  		return 0, err
   343  	}
   344  	return n, nil
   345  }
   346  
   347  // Precondition: d.mu must be locked.
   348  func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) {
   349  	switch d.dir {
   350  	case tcpRMem:
   351  		return d.stack.TCPReceiveBufferSize()
   352  	case tcpWMem:
   353  		return d.stack.TCPSendBufferSize()
   354  	default:
   355  		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
   356  	}
   357  }
   358  
   359  // Precondition: d.mu must be locked.
   360  func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
   361  	switch d.dir {
   362  	case tcpRMem:
   363  		return d.stack.SetTCPReceiveBufferSize(size)
   364  	case tcpWMem:
   365  		return d.stack.SetTCPSendBufferSize(size)
   366  	default:
   367  		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
   368  	}
   369  }
   370  
// ipForwarding implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/ip_forward.
//
// +stateify savable
type ipForwarding struct {
	kernfs.DynamicBytesFile

	// stack is the network stack on which Write sets IPv4 forwarding.
	// enabled records the value most recently stored by Write and is what
	// Generate reports; it is false until the first write.
	stack   inet.Stack `state:"wait"`
	enabled bool
}

// Compile-time check that *ipForwarding satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
   383  
   384  // Generate implements vfs.DynamicBytesSource.Generate.
   385  func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
   386  	val := "0\n"
   387  	if ipf.enabled {
   388  		// Technically, this is not quite compatible with Linux. Linux stores these
   389  		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
   390  		// Tough luck.
   391  		val = "1\n"
   392  	}
   393  	buf.WriteString(val)
   394  
   395  	return nil
   396  }
   397  
   398  // Write implements vfs.WritableDynamicBytesSource.Write.
   399  func (ipf *ipForwarding) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   400  	if offset != 0 {
   401  		// No need to handle partial writes thus far.
   402  		return 0, linuxerr.EINVAL
   403  	}
   404  	if src.NumBytes() == 0 {
   405  		return 0, nil
   406  	}
   407  
   408  	// Limit input size so as not to impact performance if input size is large.
   409  	src = src.TakeFirst(hostarch.PageSize - 1)
   410  
   411  	var v int32
   412  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   413  	if err != nil {
   414  		return 0, err
   415  	}
   416  	ipf.enabled = v != 0
   417  	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil {
   418  		return 0, err
   419  	}
   420  	return n, nil
   421  }
   422  
// portRange implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/ip_local_port_range.
//
// +stateify savable
type portRange struct {
	kernfs.DynamicBytesFile

	// stack is the network stack whose port range this file exposes.
	stack inet.Stack `state:"wait"`

	// start and end store the port range. We must save/restore this here,
	// since a netstack instance is created on restore. Both are nil until
	// Generate first queries the stack or Write stores a range.
	start *uint16
	end   *uint16
}

// Compile-time check that *portRange satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*portRange)(nil)
   439  
   440  // Generate implements vfs.DynamicBytesSource.Generate.
   441  func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error {
   442  	if pr.start == nil {
   443  		start, end := pr.stack.PortRange()
   444  		pr.start = &start
   445  		pr.end = &end
   446  	}
   447  	_, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end)
   448  	return err
   449  }
   450  
   451  // Write implements vfs.WritableDynamicBytesSource.Write.
   452  func (pr *portRange) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   453  	if offset != 0 {
   454  		// No need to handle partial writes thus far.
   455  		return 0, linuxerr.EINVAL
   456  	}
   457  	if src.NumBytes() == 0 {
   458  		return 0, nil
   459  	}
   460  
   461  	// Limit input size so as not to impact performance if input size is
   462  	// large.
   463  	src = src.TakeFirst(hostarch.PageSize - 1)
   464  
   465  	ports := make([]int32, 2)
   466  	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts)
   467  	if err != nil {
   468  		return 0, err
   469  	}
   470  
   471  	// Port numbers must be uint16s.
   472  	if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 {
   473  		return 0, linuxerr.EINVAL
   474  	}
   475  
   476  	if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil {
   477  		return 0, err
   478  	}
   479  	if pr.start == nil {
   480  		pr.start = new(uint16)
   481  		pr.end = new(uint16)
   482  	}
   483  	*pr.start = uint16(ports[0])
   484  	*pr.end = uint16(ports[1])
   485  	return n, nil
   486  }