github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/syscalls/linux/sys_mempolicy.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    26  )
    27  
    28  // We unconditionally report a single NUMA node. This also means that our
    29  // "nodemask_t" is a single unsigned long (uint64).
    30  const (
    31  	maxNodes        = 1
    32  	allowedNodemask = (1 << maxNodes) - 1
    33  )
    34  
    35  func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, error) {
    36  	// "nodemask points to a bit mask of node IDs that contains up to maxnode
    37  	// bits. The bit mask size is rounded to the next multiple of
    38  	// sizeof(unsigned long), but the kernel will use bits only up to maxnode.
    39  	// A NULL value of nodemask or a maxnode value of zero specifies the empty
    40  	// set of nodes. If the value of maxnode is zero, the nodemask argument is
    41  	// ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
    42  	// because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
    43  	// maxnode-1, not maxnode, as the number of bits.
    44  	bits := maxnode - 1
    45  	if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0
    46  		return 0, linuxerr.EINVAL
    47  	}
    48  	if bits == 0 {
    49  		return 0, nil
    50  	}
    51  	// Copy in the whole nodemask.
    52  	numUint64 := (bits + 63) / 64
    53  	buf := t.CopyScratchBuffer(int(numUint64) * 8)
    54  	if _, err := t.CopyInBytes(addr, buf); err != nil {
    55  		return 0, err
    56  	}
    57  	val := hostarch.ByteOrder.Uint64(buf)
    58  	// Check that only allowed bits in the first unsigned long in the nodemask
    59  	// are set.
    60  	if val&^allowedNodemask != 0 {
    61  		return 0, linuxerr.EINVAL
    62  	}
    63  	// Check that all remaining bits in the nodemask are 0.
    64  	for i := 8; i < len(buf); i++ {
    65  		if buf[i] != 0 {
    66  			return 0, linuxerr.EINVAL
    67  		}
    68  	}
    69  	return val, nil
    70  }
    71  
    72  func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uint64) error {
    73  	// mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of
    74  	// bits.
    75  	bits := maxnode - 1
    76  	if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0
    77  		return linuxerr.EINVAL
    78  	}
    79  	if bits == 0 {
    80  		return nil
    81  	}
    82  	// Copy out the first unsigned long in the nodemask.
    83  	buf := t.CopyScratchBuffer(8)
    84  	hostarch.ByteOrder.PutUint64(buf, val)
    85  	if _, err := t.CopyOutBytes(addr, buf); err != nil {
    86  		return err
    87  	}
    88  	// Zero out remaining unsigned longs in the nodemask.
    89  	if bits > 64 {
    90  		remAddr, ok := addr.AddLength(8)
    91  		if !ok {
    92  			return linuxerr.EFAULT
    93  		}
    94  		remUint64 := (bits - 1) / 64
    95  		if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{
    96  			AddressSpaceActive: true,
    97  		}); err != nil {
    98  			return err
    99  		}
   100  	}
   101  	return nil
   102  }
   103  
   104  // GetMempolicy implements the syscall get_mempolicy(2).
   105  func GetMempolicy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   106  	mode := args[0].Pointer()
   107  	nodemask := args[1].Pointer()
   108  	maxnode := args[2].Uint()
   109  	addr := args[3].Pointer()
   110  	flags := args[4].Uint()
   111  
   112  	if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
   113  		return 0, nil, linuxerr.EINVAL
   114  	}
   115  	nodeFlag := flags&linux.MPOL_F_NODE != 0
   116  	addrFlag := flags&linux.MPOL_F_ADDR != 0
   117  	memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
   118  
   119  	// "EINVAL: The value specified by maxnode is less than the number of node
   120  	// IDs supported by the system." - get_mempolicy(2)
   121  	if nodemask != 0 && maxnode < maxNodes {
   122  		return 0, nil, linuxerr.EINVAL
   123  	}
   124  
   125  	// "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
   126  	// ignored and the set of nodes (memories) that the thread is allowed to
   127  	// specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
   128  	// absence of any mode flags) is returned in nodemask."
   129  	if memsAllowed {
   130  		// "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
   131  		// MPOL_F_ADDR or MPOL_F_NODE."
   132  		if nodeFlag || addrFlag {
   133  			return 0, nil, linuxerr.EINVAL
   134  		}
   135  		if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
   136  			return 0, nil, err
   137  		}
   138  		return 0, nil, nil
   139  	}
   140  
   141  	// "If flags specifies MPOL_F_ADDR, then information is returned about the
   142  	// policy governing the memory address given in addr. ... If the mode
   143  	// argument is not NULL, then get_mempolicy() will store the policy mode
   144  	// and any optional mode flags of the requested NUMA policy in the location
   145  	// pointed to by this argument. If nodemask is not NULL, then the nodemask
   146  	// associated with the policy will be stored in the location pointed to by
   147  	// this argument."
   148  	if addrFlag {
   149  		policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
   150  		if err != nil {
   151  			return 0, nil, err
   152  		}
   153  		if nodeFlag {
   154  			// "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
   155  			// get_mempolicy() will return the node ID of the node on which the
   156  			// address addr is allocated into the location pointed to by mode.
   157  			// If no page has yet been allocated for the specified address,
   158  			// get_mempolicy() will allocate a page as if the thread had
   159  			// performed a read (load) access to that address, and return the
   160  			// ID of the node where that page was allocated."
   161  			buf := t.CopyScratchBuffer(1)
   162  			_, err := t.CopyInBytes(addr, buf)
   163  			if err != nil {
   164  				return 0, nil, err
   165  			}
   166  			policy = linux.MPOL_DEFAULT // maxNodes == 1
   167  		}
   168  		if mode != 0 {
   169  			if _, err := policy.CopyOut(t, mode); err != nil {
   170  				return 0, nil, err
   171  			}
   172  		}
   173  		if nodemask != 0 {
   174  			if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
   175  				return 0, nil, err
   176  			}
   177  		}
   178  		return 0, nil, nil
   179  	}
   180  
   181  	// "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
   182  	// not specify MPOL_F_ADDR and addr is not NULL." This is partially
   183  	// inaccurate: if flags specifies MPOL_F_ADDR,
   184  	// mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
   185  	// just (usually) fail to find a VMA at address 0 and return EFAULT.
   186  	if addr != 0 {
   187  		return 0, nil, linuxerr.EINVAL
   188  	}
   189  
   190  	// "If flags is specified as 0, then information about the calling thread's
   191  	// default policy (as set by set_mempolicy(2)) is returned, in the buffers
   192  	// pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
   193  	// not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
   194  	// then get_mempolicy() will return in the location pointed to by a
   195  	// non-NULL mode argument, the node ID of the next node that will be used
   196  	// for interleaving of internal kernel pages allocated on behalf of the
   197  	// thread."
   198  	policy, nodemaskVal := t.NumaPolicy()
   199  	if nodeFlag {
   200  		if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
   201  			return 0, nil, linuxerr.EINVAL
   202  		}
   203  		policy = linux.MPOL_DEFAULT // maxNodes == 1
   204  	}
   205  	if mode != 0 {
   206  		if _, err := policy.CopyOut(t, mode); err != nil {
   207  			return 0, nil, err
   208  		}
   209  	}
   210  	if nodemask != 0 {
   211  		if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
   212  			return 0, nil, err
   213  		}
   214  	}
   215  	return 0, nil, nil
   216  }
   217  
   218  // SetMempolicy implements the syscall set_mempolicy(2).
   219  func SetMempolicy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   220  	modeWithFlags := linux.NumaPolicy(args[0].Int())
   221  	nodemask := args[1].Pointer()
   222  	maxnode := args[2].Uint()
   223  
   224  	modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode)
   225  	if err != nil {
   226  		return 0, nil, err
   227  	}
   228  
   229  	t.SetNumaPolicy(modeWithFlags, nodemaskVal)
   230  	return 0, nil, nil
   231  }
   232  
   233  // Mbind implements the syscall mbind(2).
   234  func Mbind(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   235  	addr := args[0].Pointer()
   236  	length := args[1].Uint64()
   237  	mode := linux.NumaPolicy(args[2].Int())
   238  	nodemask := args[3].Pointer()
   239  	maxnode := args[4].Uint()
   240  	flags := args[5].Uint()
   241  
   242  	if flags&^linux.MPOL_MF_VALID != 0 {
   243  		return 0, nil, linuxerr.EINVAL
   244  	}
   245  	// "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be
   246  	// privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
   247  	if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) {
   248  		return 0, nil, linuxerr.EPERM
   249  	}
   250  
   251  	mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode)
   252  	if err != nil {
   253  		return 0, nil, err
   254  	}
   255  
   256  	// Since we claim to have only a single node, all flags can be ignored
   257  	// (since all pages must already be on that single node).
   258  	err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal)
   259  	return 0, nil, err
   260  }
   261  
   262  func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask hostarch.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) {
   263  	flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS)
   264  	mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS)
   265  	if flags == linux.MPOL_MODE_FLAGS {
   266  		// Can't specify both mode flags simultaneously.
   267  		return 0, 0, linuxerr.EINVAL
   268  	}
   269  	if mode < 0 || mode >= linux.MPOL_MAX {
   270  		// Must specify a valid mode.
   271  		return 0, 0, linuxerr.EINVAL
   272  	}
   273  
   274  	var nodemaskVal uint64
   275  	if nodemask != 0 {
   276  		var err error
   277  		nodemaskVal, err = copyInNodemask(t, nodemask, maxnode)
   278  		if err != nil {
   279  			return 0, 0, err
   280  		}
   281  	}
   282  
   283  	switch mode {
   284  	case linux.MPOL_DEFAULT:
   285  		// "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate;
   286  		// Linux allows a nodemask to be specified, as long as it is empty.
   287  		if nodemaskVal != 0 {
   288  			return 0, 0, linuxerr.EINVAL
   289  		}
   290  	case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
   291  		// These require a non-empty nodemask.
   292  		if nodemaskVal == 0 {
   293  			return 0, 0, linuxerr.EINVAL
   294  		}
   295  	case linux.MPOL_PREFERRED:
   296  		// This permits an empty nodemask, as long as no flags are set.
   297  		if nodemaskVal == 0 {
   298  			if flags != 0 {
   299  				return 0, 0, linuxerr.EINVAL
   300  			}
   301  			// On newer Linux versions, MPOL_PREFERRED is implemented as MPOL_LOCAL
   302  			// when node set is empty. See 7858d7bca7fb ("mm/mempolicy: don't handle
   303  			// MPOL_LOCAL like a fake MPOL_PREFERRED policy").
   304  			mode = linux.MPOL_LOCAL
   305  		}
   306  	case linux.MPOL_LOCAL:
   307  		// This requires an empty nodemask and no flags set.
   308  		if nodemaskVal != 0 || flags != 0 {
   309  			return 0, 0, linuxerr.EINVAL
   310  		}
   311  	default:
   312  		// Unknown mode, which we should have rejected above.
   313  		panic(fmt.Sprintf("unknown mode: %v", mode))
   314  	}
   315  
   316  	return mode | flags, nodemaskVal, nil
   317  }