github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/proc/tasks_sys.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  
    22  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    23  	"github.com/MerlinKodo/gvisor/pkg/context"
    24  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    25  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    26  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    31  	"github.com/MerlinKodo/gvisor/pkg/sync"
    32  	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/ipv4"
    33  	"github.com/MerlinKodo/gvisor/pkg/usermem"
    34  )
    35  
    36  // +stateify savable
    37  type tcpMemDir int
    38  
    39  const (
    40  	tcpRMem tcpMemDir = iota
    41  	tcpWMem
    42  )
    43  
    44  // newSysDir returns the dentry corresponding to /proc/sys directory.
    45  func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
    46  	return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    47  		"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    48  			"cap_last_cap": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\n", linux.CAP_LAST_CAP))),
    49  			"hostname":     fs.newInode(ctx, root, 0444, &hostnameData{}),
    50  			"sem":          fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))),
    51  			"shmall":       fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)),
    52  			"shmmax":       fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)),
    53  			"shmmni":       fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)),
    54  			"msgmni":       fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)),
    55  			"msgmax":       fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)),
    56  			"msgmnb":       fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)),
    57  			"yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    58  				"ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root),
    59  			}),
    60  		}),
    61  		"vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    62  			"max_map_count":     fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")),
    63  			"mmap_min_addr":     fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}),
    64  			"overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")),
    65  		}),
    66  		"net": fs.newSysNetDir(ctx, root, k),
    67  	})
    68  }
    69  
    70  // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
    71  func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
    72  	var contents map[string]kernfs.Inode
    73  
    74  	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
    75  	// network namespace of the calling process.
    76  	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
    77  		contents = map[string]kernfs.Inode{
    78  			"ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    79  				"ip_forward":          fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}),
    80  				"ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}),
    81  				"tcp_recovery":        fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}),
    82  				"tcp_rmem":            fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
    83  				"tcp_sack":            fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}),
    84  				"tcp_wmem":            fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
    85  
    86  				// The following files are simple stubs until they are implemented in
    87  				// netstack, most of these files are configuration related. We use the
    88  				// value closest to the actual netstack behavior or any empty file, all
    89  				// of these files will have mode 0444 (read-only for all users).
    90  				"ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")),
    91  				"ipfrag_time":             fs.newInode(ctx, root, 0444, newStaticFile("30")),
    92  				"ip_nonlocal_bind":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
    93  				"ip_no_pmtu_disc":         fs.newInode(ctx, root, 0444, newStaticFile("1")),
    94  
    95  				// tcp_allowed_congestion_control tell the user what they are able to
    96  				// do as an unprivledged process so we leave it empty.
    97  				"tcp_allowed_congestion_control":   fs.newInode(ctx, root, 0444, newStaticFile("")),
    98  				"tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
    99  				"tcp_congestion_control":           fs.newInode(ctx, root, 0444, newStaticFile("reno")),
   100  
   101  				// Many of the following stub files are features netstack doesn't
   102  				// support. The unsupported features return "0" to indicate they are
   103  				// disabled.
   104  				"tcp_base_mss":              fs.newInode(ctx, root, 0444, newStaticFile("1280")),
   105  				"tcp_dsack":                 fs.newInode(ctx, root, 0444, newStaticFile("0")),
   106  				"tcp_early_retrans":         fs.newInode(ctx, root, 0444, newStaticFile("0")),
   107  				"tcp_fack":                  fs.newInode(ctx, root, 0444, newStaticFile("0")),
   108  				"tcp_fastopen":              fs.newInode(ctx, root, 0444, newStaticFile("0")),
   109  				"tcp_fastopen_key":          fs.newInode(ctx, root, 0444, newStaticFile("")),
   110  				"tcp_invalid_ratelimit":     fs.newInode(ctx, root, 0444, newStaticFile("0")),
   111  				"tcp_keepalive_intvl":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
   112  				"tcp_keepalive_probes":      fs.newInode(ctx, root, 0444, newStaticFile("0")),
   113  				"tcp_keepalive_time":        fs.newInode(ctx, root, 0444, newStaticFile("7200")),
   114  				"tcp_mtu_probing":           fs.newInode(ctx, root, 0444, newStaticFile("0")),
   115  				"tcp_no_metrics_save":       fs.newInode(ctx, root, 0444, newStaticFile("1")),
   116  				"tcp_probe_interval":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
   117  				"tcp_probe_threshold":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
   118  				"tcp_retries1":              fs.newInode(ctx, root, 0444, newStaticFile("3")),
   119  				"tcp_retries2":              fs.newInode(ctx, root, 0444, newStaticFile("15")),
   120  				"tcp_rfc1337":               fs.newInode(ctx, root, 0444, newStaticFile("1")),
   121  				"tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")),
   122  				"tcp_synack_retries":        fs.newInode(ctx, root, 0444, newStaticFile("5")),
   123  				"tcp_syn_retries":           fs.newInode(ctx, root, 0444, newStaticFile("3")),
   124  				"tcp_timestamps":            fs.newInode(ctx, root, 0444, newStaticFile("1")),
   125  			}),
   126  			"core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
   127  				"default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")),
   128  				"message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")),
   129  				"message_cost":  fs.newInode(ctx, root, 0444, newStaticFile("5")),
   130  				"optmem_max":    fs.newInode(ctx, root, 0444, newStaticFile("0")),
   131  				"rmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   132  				"rmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   133  				"somaxconn":     fs.newInode(ctx, root, 0444, newStaticFile("128")),
   134  				"wmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   135  				"wmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   136  			}),
   137  		}
   138  	}
   139  
   140  	return fs.newStaticDir(ctx, root, contents)
   141  }
   142  
   143  // mmapMinAddrData implements vfs.DynamicBytesSource for
   144  // /proc/sys/vm/mmap_min_addr.
   145  //
   146  // +stateify savable
   147  type mmapMinAddrData struct {
   148  	kernfs.DynamicBytesFile
   149  
   150  	k *kernel.Kernel
   151  }
   152  
   153  var _ dynamicInode = (*mmapMinAddrData)(nil)
   154  
   155  // Generate implements vfs.DynamicBytesSource.Generate.
   156  func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   157  	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
   158  	return nil
   159  }
   160  
   161  // hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname.
   162  //
   163  // +stateify savable
   164  type hostnameData struct {
   165  	kernfs.DynamicBytesFile
   166  }
   167  
   168  var _ dynamicInode = (*hostnameData)(nil)
   169  
   170  // Generate implements vfs.DynamicBytesSource.Generate.
   171  func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   172  	utsns := kernel.UTSNamespaceFromContext(ctx)
   173  	defer utsns.DecRef(ctx)
   174  	buf.WriteString(utsns.HostName())
   175  	buf.WriteString("\n")
   176  	return nil
   177  }
   178  
   179  // tcpSackData implements vfs.WritableDynamicBytesSource for
   180  // /proc/sys/net/tcp_sack.
   181  //
   182  // +stateify savable
   183  type tcpSackData struct {
   184  	kernfs.DynamicBytesFile
   185  
   186  	stack   inet.Stack `state:"wait"`
   187  	enabled *bool
   188  }
   189  
   190  var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
   191  
   192  // Generate implements vfs.DynamicBytesSource.Generate.
   193  func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   194  	if d.enabled == nil {
   195  		sack, err := d.stack.TCPSACKEnabled()
   196  		if err != nil {
   197  			return err
   198  		}
   199  		d.enabled = &sack
   200  	}
   201  
   202  	val := "0\n"
   203  	if *d.enabled {
   204  		// Technically, this is not quite compatible with Linux. Linux stores these
   205  		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
   206  		// Tough luck.
   207  		val = "1\n"
   208  	}
   209  	_, err := buf.WriteString(val)
   210  	return err
   211  }
   212  
   213  // Write implements vfs.WritableDynamicBytesSource.Write.
   214  func (d *tcpSackData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   215  	if offset != 0 {
   216  		// No need to handle partial writes thus far.
   217  		return 0, linuxerr.EINVAL
   218  	}
   219  	if src.NumBytes() == 0 {
   220  		return 0, nil
   221  	}
   222  
   223  	// Limit the amount of memory allocated.
   224  	src = src.TakeFirst(hostarch.PageSize - 1)
   225  
   226  	var v int32
   227  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   228  	if err != nil {
   229  		return 0, err
   230  	}
   231  	if d.enabled == nil {
   232  		d.enabled = new(bool)
   233  	}
   234  	*d.enabled = v != 0
   235  	return n, d.stack.SetTCPSACKEnabled(*d.enabled)
   236  }
   237  
   238  // tcpRecoveryData implements vfs.WritableDynamicBytesSource for
   239  // /proc/sys/net/ipv4/tcp_recovery.
   240  //
   241  // +stateify savable
   242  type tcpRecoveryData struct {
   243  	kernfs.DynamicBytesFile
   244  
   245  	stack inet.Stack `state:"wait"`
   246  }
   247  
   248  var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
   249  
   250  // Generate implements vfs.DynamicBytesSource.Generate.
   251  func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   252  	recovery, err := d.stack.TCPRecovery()
   253  	if err != nil {
   254  		return err
   255  	}
   256  
   257  	_, err = buf.WriteString(fmt.Sprintf("%d\n", recovery))
   258  	return err
   259  }
   260  
   261  // Write implements vfs.WritableDynamicBytesSource.Write.
   262  func (d *tcpRecoveryData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   263  	if offset != 0 {
   264  		// No need to handle partial writes thus far.
   265  		return 0, linuxerr.EINVAL
   266  	}
   267  	if src.NumBytes() == 0 {
   268  		return 0, nil
   269  	}
   270  
   271  	// Limit the amount of memory allocated.
   272  	src = src.TakeFirst(hostarch.PageSize - 1)
   273  
   274  	var v int32
   275  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   276  	if err != nil {
   277  		return 0, err
   278  	}
   279  	if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
   280  		return 0, err
   281  	}
   282  	return n, nil
   283  }
   284  
   285  // tcpMemData implements vfs.WritableDynamicBytesSource for
   286  // /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem.
   287  //
   288  // +stateify savable
   289  type tcpMemData struct {
   290  	kernfs.DynamicBytesFile
   291  
   292  	dir   tcpMemDir
   293  	stack inet.Stack `state:"wait"`
   294  
   295  	// mu protects against concurrent reads/writes to FDs based on the dentry
   296  	// backing this byte source.
   297  	mu sync.Mutex `state:"nosave"`
   298  }
   299  
   300  var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
   301  
   302  // Generate implements vfs.DynamicBytesSource.Generate.
   303  func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   304  	d.mu.Lock()
   305  	defer d.mu.Unlock()
   306  
   307  	size, err := d.readSizeLocked()
   308  	if err != nil {
   309  		return err
   310  	}
   311  	_, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max))
   312  	return err
   313  }
   314  
   315  // Write implements vfs.WritableDynamicBytesSource.Write.
   316  func (d *tcpMemData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   317  	if offset != 0 {
   318  		// No need to handle partial writes thus far.
   319  		return 0, linuxerr.EINVAL
   320  	}
   321  	if src.NumBytes() == 0 {
   322  		return 0, nil
   323  	}
   324  	d.mu.Lock()
   325  	defer d.mu.Unlock()
   326  
   327  	// Limit the amount of memory allocated.
   328  	src = src.TakeFirst(hostarch.PageSize - 1)
   329  	size, err := d.readSizeLocked()
   330  	if err != nil {
   331  		return 0, err
   332  	}
   333  	buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)}
   334  	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts)
   335  	if err != nil {
   336  		return 0, err
   337  	}
   338  	newSize := inet.TCPBufferSize{
   339  		Min:     int(buf[0]),
   340  		Default: int(buf[1]),
   341  		Max:     int(buf[2]),
   342  	}
   343  	if err := d.writeSizeLocked(newSize); err != nil {
   344  		return 0, err
   345  	}
   346  	return n, nil
   347  }
   348  
   349  // Precondition: d.mu must be locked.
   350  func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) {
   351  	switch d.dir {
   352  	case tcpRMem:
   353  		return d.stack.TCPReceiveBufferSize()
   354  	case tcpWMem:
   355  		return d.stack.TCPSendBufferSize()
   356  	default:
   357  		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
   358  	}
   359  }
   360  
   361  // Precondition: d.mu must be locked.
   362  func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
   363  	switch d.dir {
   364  	case tcpRMem:
   365  		return d.stack.SetTCPReceiveBufferSize(size)
   366  	case tcpWMem:
   367  		return d.stack.SetTCPSendBufferSize(size)
   368  	default:
   369  		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
   370  	}
   371  }
   372  
   373  // ipForwarding implements vfs.WritableDynamicBytesSource for
   374  // /proc/sys/net/ipv4/ip_forward.
   375  //
   376  // +stateify savable
   377  type ipForwarding struct {
   378  	kernfs.DynamicBytesFile
   379  
   380  	stack   inet.Stack `state:"wait"`
   381  	enabled bool
   382  }
   383  
   384  var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
   385  
   386  // Generate implements vfs.DynamicBytesSource.Generate.
   387  func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
   388  	val := "0\n"
   389  	if ipf.enabled {
   390  		// Technically, this is not quite compatible with Linux. Linux stores these
   391  		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
   392  		// Tough luck.
   393  		val = "1\n"
   394  	}
   395  	buf.WriteString(val)
   396  
   397  	return nil
   398  }
   399  
   400  // Write implements vfs.WritableDynamicBytesSource.Write.
   401  func (ipf *ipForwarding) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   402  	if offset != 0 {
   403  		// No need to handle partial writes thus far.
   404  		return 0, linuxerr.EINVAL
   405  	}
   406  	if src.NumBytes() == 0 {
   407  		return 0, nil
   408  	}
   409  
   410  	// Limit input size so as not to impact performance if input size is large.
   411  	src = src.TakeFirst(hostarch.PageSize - 1)
   412  
   413  	var v int32
   414  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   415  	if err != nil {
   416  		return 0, err
   417  	}
   418  	ipf.enabled = v != 0
   419  	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil {
   420  		return 0, err
   421  	}
   422  	return n, nil
   423  }
   424  
   425  // portRange implements vfs.WritableDynamicBytesSource for
   426  // /proc/sys/net/ipv4/ip_local_port_range.
   427  //
   428  // +stateify savable
   429  type portRange struct {
   430  	kernfs.DynamicBytesFile
   431  
   432  	stack inet.Stack `state:"wait"`
   433  
   434  	// start and end store the port range. We must save/restore this here,
   435  	// since a netstack instance is created on restore.
   436  	start *uint16
   437  	end   *uint16
   438  }
   439  
   440  var _ vfs.WritableDynamicBytesSource = (*portRange)(nil)
   441  
   442  // Generate implements vfs.DynamicBytesSource.Generate.
   443  func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error {
   444  	if pr.start == nil {
   445  		start, end := pr.stack.PortRange()
   446  		pr.start = &start
   447  		pr.end = &end
   448  	}
   449  	_, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end)
   450  	return err
   451  }
   452  
   453  // Write implements vfs.WritableDynamicBytesSource.Write.
   454  func (pr *portRange) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   455  	if offset != 0 {
   456  		// No need to handle partial writes thus far.
   457  		return 0, linuxerr.EINVAL
   458  	}
   459  	if src.NumBytes() == 0 {
   460  		return 0, nil
   461  	}
   462  
   463  	// Limit input size so as not to impact performance if input size is
   464  	// large.
   465  	src = src.TakeFirst(hostarch.PageSize - 1)
   466  
   467  	ports := make([]int32, 2)
   468  	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts)
   469  	if err != nil {
   470  		return 0, err
   471  	}
   472  
   473  	// Port numbers must be uint16s.
   474  	if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 {
   475  		return 0, linuxerr.EINVAL
   476  	}
   477  
   478  	if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil {
   479  		return 0, err
   480  	}
   481  	if pr.start == nil {
   482  		pr.start = new(uint16)
   483  		pr.end = new(uint16)
   484  	}
   485  	*pr.start = uint16(ports[0])
   486  	*pr.end = uint16(ports[1])
   487  	return n, nil
   488  }