github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/proc/tasks_sys.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"math"
    21  
    22  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    23  	"github.com/SagerNet/gvisor/pkg/context"
    24  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    25  	"github.com/SagerNet/gvisor/pkg/hostarch"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/inet"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    31  	"github.com/SagerNet/gvisor/pkg/sync"
    32  	"github.com/SagerNet/gvisor/pkg/tcpip/network/ipv4"
    33  	"github.com/SagerNet/gvisor/pkg/usermem"
    34  )
    35  
// tcpMemDir selects which TCP buffer-size setting a tcpMemData file exposes.
//
// +stateify savable
type tcpMemDir int

const (
	// tcpRMem selects the TCP receive buffer sizes (backs tcp_rmem).
	tcpRMem tcpMemDir = iota
	// tcpWMem selects the TCP send buffer sizes (backs tcp_wmem).
	tcpWMem
)
    43  
    44  // newSysDir returns the dentry corresponding to /proc/sys directory.
    45  func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
    46  	return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    47  		"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    48  			"hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
    49  			"sem":      fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))),
    50  			"shmall":   fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)),
    51  			"shmmax":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)),
    52  			"shmmni":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)),
    53  			"yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    54  				"ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root),
    55  			}),
    56  		}),
    57  		"vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    58  			"max_map_count":     fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")),
    59  			"mmap_min_addr":     fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}),
    60  			"overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")),
    61  		}),
    62  		"net": fs.newSysNetDir(ctx, root, k),
    63  	})
    64  }
    65  
    66  // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
    67  func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
    68  	var contents map[string]kernfs.Inode
    69  
    70  	// TODO(github.com/SagerNet/issue/1833): Support for using the network stack in the
    71  	// network namespace of the calling process.
    72  	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
    73  		contents = map[string]kernfs.Inode{
    74  			"ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
    75  				"ip_forward":          fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}),
    76  				"ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}),
    77  				"tcp_recovery":        fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}),
    78  				"tcp_rmem":            fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
    79  				"tcp_sack":            fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}),
    80  				"tcp_wmem":            fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
    81  
    82  				// The following files are simple stubs until they are implemented in
    83  				// netstack, most of these files are configuration related. We use the
    84  				// value closest to the actual netstack behavior or any empty file, all
    85  				// of these files will have mode 0444 (read-only for all users).
    86  				"ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")),
    87  				"ipfrag_time":             fs.newInode(ctx, root, 0444, newStaticFile("30")),
    88  				"ip_nonlocal_bind":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
    89  				"ip_no_pmtu_disc":         fs.newInode(ctx, root, 0444, newStaticFile("1")),
    90  
    91  				// tcp_allowed_congestion_control tell the user what they are able to
    92  				// do as an unprivledged process so we leave it empty.
    93  				"tcp_allowed_congestion_control":   fs.newInode(ctx, root, 0444, newStaticFile("")),
    94  				"tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
    95  				"tcp_congestion_control":           fs.newInode(ctx, root, 0444, newStaticFile("reno")),
    96  
    97  				// Many of the following stub files are features netstack doesn't
    98  				// support. The unsupported features return "0" to indicate they are
    99  				// disabled.
   100  				"tcp_base_mss":              fs.newInode(ctx, root, 0444, newStaticFile("1280")),
   101  				"tcp_dsack":                 fs.newInode(ctx, root, 0444, newStaticFile("0")),
   102  				"tcp_early_retrans":         fs.newInode(ctx, root, 0444, newStaticFile("0")),
   103  				"tcp_fack":                  fs.newInode(ctx, root, 0444, newStaticFile("0")),
   104  				"tcp_fastopen":              fs.newInode(ctx, root, 0444, newStaticFile("0")),
   105  				"tcp_fastopen_key":          fs.newInode(ctx, root, 0444, newStaticFile("")),
   106  				"tcp_invalid_ratelimit":     fs.newInode(ctx, root, 0444, newStaticFile("0")),
   107  				"tcp_keepalive_intvl":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
   108  				"tcp_keepalive_probes":      fs.newInode(ctx, root, 0444, newStaticFile("0")),
   109  				"tcp_keepalive_time":        fs.newInode(ctx, root, 0444, newStaticFile("7200")),
   110  				"tcp_mtu_probing":           fs.newInode(ctx, root, 0444, newStaticFile("0")),
   111  				"tcp_no_metrics_save":       fs.newInode(ctx, root, 0444, newStaticFile("1")),
   112  				"tcp_probe_interval":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
   113  				"tcp_probe_threshold":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
   114  				"tcp_retries1":              fs.newInode(ctx, root, 0444, newStaticFile("3")),
   115  				"tcp_retries2":              fs.newInode(ctx, root, 0444, newStaticFile("15")),
   116  				"tcp_rfc1337":               fs.newInode(ctx, root, 0444, newStaticFile("1")),
   117  				"tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")),
   118  				"tcp_synack_retries":        fs.newInode(ctx, root, 0444, newStaticFile("5")),
   119  				"tcp_syn_retries":           fs.newInode(ctx, root, 0444, newStaticFile("3")),
   120  				"tcp_timestamps":            fs.newInode(ctx, root, 0444, newStaticFile("1")),
   121  			}),
   122  			"core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
   123  				"default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")),
   124  				"message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")),
   125  				"message_cost":  fs.newInode(ctx, root, 0444, newStaticFile("5")),
   126  				"optmem_max":    fs.newInode(ctx, root, 0444, newStaticFile("0")),
   127  				"rmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   128  				"rmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   129  				"somaxconn":     fs.newInode(ctx, root, 0444, newStaticFile("128")),
   130  				"wmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   131  				"wmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
   132  			}),
   133  		}
   134  	}
   135  
   136  	return fs.newStaticDir(ctx, root, contents)
   137  }
   138  
// mmapMinAddrData implements vfs.DynamicBytesSource for
// /proc/sys/vm/mmap_min_addr.
//
// The k field is the kernel whose platform is asked for the minimum user
// address each time the file is generated.
//
// +stateify savable
type mmapMinAddrData struct {
	kernfs.DynamicBytesFile

	k *kernel.Kernel
}

// Compile-time check that *mmapMinAddrData satisfies dynamicInode.
var _ dynamicInode = (*mmapMinAddrData)(nil)
   150  
   151  // Generate implements vfs.DynamicBytesSource.Generate.
   152  func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   153  	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
   154  	return nil
   155  }
   156  
// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname.
//
// It carries no state of its own; the hostname is looked up through the
// context passed to Generate.
//
// +stateify savable
type hostnameData struct {
	kernfs.DynamicBytesFile
}

// Compile-time check that *hostnameData satisfies dynamicInode.
var _ dynamicInode = (*hostnameData)(nil)
   165  
   166  // Generate implements vfs.DynamicBytesSource.Generate.
   167  func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   168  	utsns := kernel.UTSNamespaceFromContext(ctx)
   169  	buf.WriteString(utsns.HostName())
   170  	buf.WriteString("\n")
   171  	return nil
   172  }
   173  
// tcpSackData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_sack.
//
// The enabled field caches the SACK setting. It is nil until the first read
// (which loads it from the stack) or the first write (which stores the
// written value).
//
// +stateify savable
type tcpSackData struct {
	kernfs.DynamicBytesFile

	stack   inet.Stack `state:"wait"`
	enabled *bool
}

// Compile-time check that *tcpSackData satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
   186  
   187  // Generate implements vfs.DynamicBytesSource.Generate.
   188  func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   189  	if d.enabled == nil {
   190  		sack, err := d.stack.TCPSACKEnabled()
   191  		if err != nil {
   192  			return err
   193  		}
   194  		d.enabled = &sack
   195  	}
   196  
   197  	val := "0\n"
   198  	if *d.enabled {
   199  		// Technically, this is not quite compatible with Linux. Linux stores these
   200  		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
   201  		// Tough luck.
   202  		val = "1\n"
   203  	}
   204  	_, err := buf.WriteString(val)
   205  	return err
   206  }
   207  
   208  // Write implements vfs.WritableDynamicBytesSource.Write.
   209  func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
   210  	if offset != 0 {
   211  		// No need to handle partial writes thus far.
   212  		return 0, linuxerr.EINVAL
   213  	}
   214  	if src.NumBytes() == 0 {
   215  		return 0, nil
   216  	}
   217  
   218  	// Limit the amount of memory allocated.
   219  	src = src.TakeFirst(hostarch.PageSize - 1)
   220  
   221  	var v int32
   222  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   223  	if err != nil {
   224  		return 0, err
   225  	}
   226  	if d.enabled == nil {
   227  		d.enabled = new(bool)
   228  	}
   229  	*d.enabled = v != 0
   230  	return n, d.stack.SetTCPSACKEnabled(*d.enabled)
   231  }
   232  
// tcpRecoveryData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_recovery.
//
// No value is cached here: reads and writes go straight to the stack.
//
// +stateify savable
type tcpRecoveryData struct {
	kernfs.DynamicBytesFile

	stack inet.Stack `state:"wait"`
}

// Compile-time check that *tcpRecoveryData satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
   244  
   245  // Generate implements vfs.DynamicBytesSource.Generate.
   246  func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   247  	recovery, err := d.stack.TCPRecovery()
   248  	if err != nil {
   249  		return err
   250  	}
   251  
   252  	_, err = buf.WriteString(fmt.Sprintf("%d\n", recovery))
   253  	return err
   254  }
   255  
   256  // Write implements vfs.WritableDynamicBytesSource.Write.
   257  func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
   258  	if offset != 0 {
   259  		// No need to handle partial writes thus far.
   260  		return 0, linuxerr.EINVAL
   261  	}
   262  	if src.NumBytes() == 0 {
   263  		return 0, nil
   264  	}
   265  
   266  	// Limit the amount of memory allocated.
   267  	src = src.TakeFirst(hostarch.PageSize - 1)
   268  
   269  	var v int32
   270  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   271  	if err != nil {
   272  		return 0, err
   273  	}
   274  	if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
   275  		return 0, err
   276  	}
   277  	return n, nil
   278  }
   279  
// tcpMemData implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem.
//
// The dir field selects which of the two files an instance backs: tcpRMem
// reads and writes the stack's TCP receive buffer sizes, tcpWMem its TCP send
// buffer sizes.
//
// +stateify savable
type tcpMemData struct {
	kernfs.DynamicBytesFile

	dir   tcpMemDir
	stack inet.Stack `state:"wait"`

	// mu protects against concurrent reads/writes to FDs based on the dentry
	// backing this byte source.
	mu sync.Mutex `state:"nosave"`
}

// Compile-time check that *tcpMemData satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
   296  
   297  // Generate implements vfs.DynamicBytesSource.Generate.
   298  func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   299  	d.mu.Lock()
   300  	defer d.mu.Unlock()
   301  
   302  	size, err := d.readSizeLocked()
   303  	if err != nil {
   304  		return err
   305  	}
   306  	_, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max))
   307  	return err
   308  }
   309  
   310  // Write implements vfs.WritableDynamicBytesSource.Write.
   311  func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
   312  	if offset != 0 {
   313  		// No need to handle partial writes thus far.
   314  		return 0, linuxerr.EINVAL
   315  	}
   316  	if src.NumBytes() == 0 {
   317  		return 0, nil
   318  	}
   319  	d.mu.Lock()
   320  	defer d.mu.Unlock()
   321  
   322  	// Limit the amount of memory allocated.
   323  	src = src.TakeFirst(hostarch.PageSize - 1)
   324  	size, err := d.readSizeLocked()
   325  	if err != nil {
   326  		return 0, err
   327  	}
   328  	buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)}
   329  	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts)
   330  	if err != nil {
   331  		return 0, err
   332  	}
   333  	newSize := inet.TCPBufferSize{
   334  		Min:     int(buf[0]),
   335  		Default: int(buf[1]),
   336  		Max:     int(buf[2]),
   337  	}
   338  	if err := d.writeSizeLocked(newSize); err != nil {
   339  		return 0, err
   340  	}
   341  	return n, nil
   342  }
   343  
   344  // Precondition: d.mu must be locked.
   345  func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) {
   346  	switch d.dir {
   347  	case tcpRMem:
   348  		return d.stack.TCPReceiveBufferSize()
   349  	case tcpWMem:
   350  		return d.stack.TCPSendBufferSize()
   351  	default:
   352  		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
   353  	}
   354  }
   355  
   356  // Precondition: d.mu must be locked.
   357  func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
   358  	switch d.dir {
   359  	case tcpRMem:
   360  		return d.stack.SetTCPReceiveBufferSize(size)
   361  	case tcpWMem:
   362  		return d.stack.SetTCPSendBufferSize(size)
   363  	default:
   364  		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
   365  	}
   366  }
   367  
// ipForwarding implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/ip_forward.
//
// The enabled field is set by Write and reported by Generate. Generate does
// not consult the stack, so the file reads as 0 until a value has been
// written.
//
// +stateify savable
type ipForwarding struct {
	kernfs.DynamicBytesFile

	stack   inet.Stack `state:"wait"`
	enabled bool
}

// Compile-time check that *ipForwarding satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
   380  
   381  // Generate implements vfs.DynamicBytesSource.Generate.
   382  func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
   383  	val := "0\n"
   384  	if ipf.enabled {
   385  		// Technically, this is not quite compatible with Linux. Linux stores these
   386  		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
   387  		// Tough luck.
   388  		val = "1\n"
   389  	}
   390  	buf.WriteString(val)
   391  
   392  	return nil
   393  }
   394  
   395  // Write implements vfs.WritableDynamicBytesSource.Write.
   396  func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
   397  	if offset != 0 {
   398  		// No need to handle partial writes thus far.
   399  		return 0, linuxerr.EINVAL
   400  	}
   401  	if src.NumBytes() == 0 {
   402  		return 0, nil
   403  	}
   404  
   405  	// Limit input size so as not to impact performance if input size is large.
   406  	src = src.TakeFirst(hostarch.PageSize - 1)
   407  
   408  	var v int32
   409  	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
   410  	if err != nil {
   411  		return 0, err
   412  	}
   413  	ipf.enabled = v != 0
   414  	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil {
   415  		return 0, err
   416  	}
   417  	return n, nil
   418  }
   419  
// portRange implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/ip_local_port_range.
//
// The start and end fields are nil until the first read (which loads them
// from the stack) or the first successful write.
//
// +stateify savable
type portRange struct {
	kernfs.DynamicBytesFile

	stack inet.Stack `state:"wait"`

	// start and end store the port range. We must save/restore this here,
	// since a netstack instance is created on restore.
	start *uint16
	end   *uint16
}

// Compile-time check that *portRange satisfies
// vfs.WritableDynamicBytesSource.
var _ vfs.WritableDynamicBytesSource = (*portRange)(nil)
   436  
   437  // Generate implements vfs.DynamicBytesSource.Generate.
   438  func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error {
   439  	if pr.start == nil {
   440  		start, end := pr.stack.PortRange()
   441  		pr.start = &start
   442  		pr.end = &end
   443  	}
   444  	_, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end)
   445  	return err
   446  }
   447  
   448  // Write implements vfs.WritableDynamicBytesSource.Write.
   449  func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
   450  	if offset != 0 {
   451  		// No need to handle partial writes thus far.
   452  		return 0, linuxerr.EINVAL
   453  	}
   454  	if src.NumBytes() == 0 {
   455  		return 0, nil
   456  	}
   457  
   458  	// Limit input size so as not to impact performance if input size is
   459  	// large.
   460  	src = src.TakeFirst(hostarch.PageSize - 1)
   461  
   462  	ports := make([]int32, 2)
   463  	n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts)
   464  	if err != nil {
   465  		return 0, err
   466  	}
   467  
   468  	// Port numbers must be uint16s.
   469  	if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 {
   470  		return 0, linuxerr.EINVAL
   471  	}
   472  
   473  	if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil {
   474  		return 0, err
   475  	}
   476  	if pr.start == nil {
   477  		pr.start = new(uint16)
   478  		pr.end = new(uint16)
   479  	}
   480  	*pr.start = uint16(ports[0])
   481  	*pr.end = uint16(ports[1])
   482  	return n, nil
   483  }