github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/shared_context.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systrap

import (
	"fmt"
	"runtime"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"github.com/MerlinKodo/gvisor/pkg/syncevent"
	"golang.org/x/sys/unix"
)

const (
	ackReset uint32 = 0
)

// sharedContext is an abstraction for interactions that the sentry has to
// perform with memory shared between it and the stub threads used for contexts.
//
// Any access to shared memory should most likely have a getter/setter through
// this struct. This is due to the following reasons:
//   - The memory needs to be read or modified atomically because there is no
//     (trusted) synchronization between the sentry and the stub processes.
//   - Data read from shared memory may require validation before it can be used.
type sharedContext struct {
	contextEntry

	// subprocess is the subprocess that this sharedContext instance belongs to.
	subprocess *subprocess
	// contextID is the ID corresponding to the sysmsg.ThreadContext memory slot
	// that is used for this sharedContext.
	contextID uint32
	// shared is the handle to the shared memory that the sentry task go-routine
	// reads from and writes to.
	// NOTE: Using this handle directly without a getter from this struct should
	//       most likely be avoided due to the concerns listed above.
	shared *sysmsg.ThreadContext

	// sync is used by the context go-routine to wait for events from the
	// dispatcher.
	sync           syncevent.Waiter
	startWaitingTS int64
	kicked         bool
	// sleeping is true if the task associated with the context fell asleep.
	sleeping bool
}
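
// As an illustration of the access pattern described above, callers are
// expected to go through the atomic getters/setters defined below rather than
// touch sc.shared directly. A minimal, hypothetical sketch (not part of this
// file) that reads and validates one shared field:
//
//	// Read the stub thread ID atomically and validate it before use.
//	tid := sc.threadID()
//	if tid == invalidThreadID {
//		// The context is not running on any stub thread right now.
//		return
//	}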

// String returns the ID of this shared context.
func (sc *sharedContext) String() string {
	return strconv.Itoa(int(sc.contextID))
}

const (
	// sharedContextReady indicates that a context has new events.
	sharedContextReady = syncevent.Set(1 << iota)
	// sharedContextKicked indicates that a new stub thread should be woken up.
	sharedContextKicked
	// sharedContextSlowPath indicates that a context has to be waited for in the
	// slow path.
	sharedContextSlowPath
	// sharedContextDispatch indicates that a context go-routine has to start the wait loop.
	sharedContextDispatch
)
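
// These values form a bitmask delivered through a single syncevent.Waiter, so
// a waiter can receive several events at once and must test each bit of the
// returned set. A hedged sketch (hypothetical, mirroring how loop and waitFor
// below use the waiter of some *sharedContext sc):
//
//	events := sc.sync.WaitAndAckAll()
//	if events&sharedContextReady != 0 {
//		// The shared context has new events to handle.
//	}
//	if events&sharedContextDispatch != 0 {
//		// This go-routine has been chosen to run the dispatcher loop.
//	}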

func (s *subprocess) getSharedContext() (*sharedContext, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	id, ok := s.threadContextPool.Get()
	if !ok {
		return nil, fmt.Errorf("subprocess has too many active tasks (%d); failed to create a new one", maxGuestContexts)
	}
	s.IncRef()
	sc := sharedContext{
		subprocess: s,
		contextID:  uint32(id),
		shared:     s.getThreadContextFromID(id),
	}
	sc.shared.Init(invalidThreadID)
	sc.sync.Init()
	sc.sleeping = true

	return &sc, nil
}
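
// A shared context is allocated from the subprocess's thread context pool and
// must be returned to it with release() below. A minimal, hypothetical usage
// sketch (the real call sites live elsewhere in the systrap package):
//
//	sc, err := s.getSharedContext()
//	if err != nil {
//		return err
//	}
//	defer sc.release()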

func (sc *sharedContext) release() {
	if sc == nil {
		return
	}
	if !sc.sleeping {
		sc.subprocess.decAwakeContexts()
	}
	sc.subprocess.threadContextPool.Put(uint64(sc.contextID))
	sc.subprocess.DecRef(sc.subprocess.release)
}

func (sc *sharedContext) isActiveInSubprocess(s *subprocess) bool {
	if sc == nil {
		return false
	}
	return sc.subprocess == s
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (sc *sharedContext) NotifyInterrupt() {
	// If this context is not being worked on right now, we need to mark it as
	// interrupted so the next executor does not start working on it.
	atomic.StoreUint32(&sc.shared.Interrupt, 1)
	if sc.threadID() == invalidThreadID {
		return
	}
	sc.subprocess.sysmsgThreadsMu.Lock()
	defer sc.subprocess.sysmsgThreadsMu.Unlock()

	threadID := atomic.LoadUint32(&sc.shared.ThreadID)
	sysmsgThread, ok := sc.subprocess.sysmsgThreads[threadID]
	if !ok {
		// This is either an invalidThreadID or another garbage value; either way we
		// don't know which thread to interrupt; the best we can do is mark the context.
		return
	}

	t := sysmsgThread.thread
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(platform.SignalInterrupt)); e != 0 {
		panic(fmt.Sprintf("failed to interrupt the child process %d: %v", t.tid, e))
	}
}

func (sc *sharedContext) state() sysmsg.ContextState {
	return sc.shared.State.Get()
}

func (sc *sharedContext) setState(state sysmsg.ContextState) {
	sc.shared.State.Set(state)
}

func (sc *sharedContext) setInterrupt() {
	atomic.StoreUint32(&sc.shared.Interrupt, 1)
}

func (sc *sharedContext) clearInterrupt() {
	atomic.StoreUint32(&sc.shared.Interrupt, 0)
}

func (sc *sharedContext) setFPStateChanged() {
	atomic.StoreUint64(&sc.shared.FPStateChanged, 1)
}

func (sc *sharedContext) threadID() uint32 {
	return atomic.LoadUint32(&sc.shared.ThreadID)
}

// enableSentryFastPath indicates that polling mode is enabled for the Sentry.
// It has to be called before putting the context into the context queue.
func (sc *sharedContext) enableSentryFastPath() {
	atomic.StoreUint32(&sc.shared.SentryFastPath, 1)
}

// disableSentryFastPath indicates that polling mode is disabled for the Sentry.
func (sc *sharedContext) disableSentryFastPath() {
	atomic.StoreUint32(&sc.shared.SentryFastPath, 0)
}

func (sc *sharedContext) isAcked() bool {
	return atomic.LoadUint32(&sc.shared.Acked) != ackReset
}

func (sc *sharedContext) resetAcked() {
	atomic.StoreUint32(&sc.shared.Acked, ackReset)
}

const (
	contextPreemptTimeoutNsec = 10 * 1000 * 1000 // 10ms
	contextCheckupTimeoutSec  = 5
	stuckContextTimeout       = 30 * time.Second
)

func (sc *sharedContext) sleepOnState(state sysmsg.ContextState) {
	timeout := unix.Timespec{
		Sec:  0,
		Nsec: contextPreemptTimeoutNsec,
	}
	sentInterruptOnce := false
	deadline := time.Now().Add(stuckContextTimeout)
	for sc.state() == state {
		errno := sc.shared.SleepOnState(state, &timeout)
		if errno == 0 {
			continue
		}
		if errno != unix.ETIMEDOUT {
			panic(fmt.Sprintf("error waiting for state: %v", errno))
		}
		if time.Now().After(deadline) {
			log.Warningf("Systrap task goroutine has been waiting on ThreadContext.State futex too long. ThreadContext: %v", sc)
		}
		if sentInterruptOnce {
			log.Warningf("The context is still running: %v", sc)
			continue
		}

		if !sc.isAcked() || sc.subprocess.contextQueue.isEmpty() {
			continue
		}
		sc.NotifyInterrupt()
		sentInterruptOnce = true
		timeout.Sec = contextCheckupTimeoutSec
		timeout.Nsec = 0
	}
}
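
// A hedged caller sketch for sleepOnState above (hypothetical; the real call
// sites live elsewhere in the systrap package):
//
//	// Block until the stub thread moves the context out of ContextStateNone,
//	// escalating from a short preempt timeout to a single interrupt and then
//	// to periodic checkups, as implemented above.
//	sc.sleepOnState(sysmsg.ContextStateNone)
//	newState := sc.state() // Validate and handle the resulting state.
//	_ = newState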

type fastPathDispatcher struct {
	// list is used only from the loop method and so it isn't protected by
	// any lock.
	list contextList

	mu sync.Mutex

	// nr is the number of contexts in the queue.
	// +checklocks:mu
	nr int

	// entrants contains new contexts that haven't been added to `list` yet.
	// +checklocks:mu
	entrants contextList

	// fastPathDisabledTS is the timestamp when the stub fast path was
	// disabled. It is zero if the fast path is enabled.
	fastPathDisabledTS atomic.Uint64
}
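
// There is a single dispatcher instance shared by all context goroutines (see
// `dispatcher` below). Producers append to `entrants` under `mu`; the one
// goroutine currently running loop() splices `entrants` into `list`, which it
// accesses without holding any lock. A hedged sketch of a context goroutine
// entering the dispatcher (hypothetical; the real setup of startWaitingTS and
// the call sites live elsewhere in the package):
//
//	sc.startWaitingTS = cputicks()
//	events := dispatcher.waitFor(sc) // May end up running loop() itself.
//	_ = events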

var dispatcher fastPathDispatcher

// fastPathContextLimit is the maximum number of contexts after which the fast
// path in stub threads is disabled. Its value can be higher than the number of
// CPUs because the Sentry runs at a higher priority than stub threads and
// deepSleepTimeout is much shorter than the Linux scheduler timeslice, so the
// only thing that matters here is whether the Sentry handles syscalls faster
// than the overhead of scheduling another stub thread.
var fastPathContextLimit = uint32(runtime.GOMAXPROCS(0) * 2)

// fastPathDisabledTimeout is the timeout after which the fast path in stub
// processes will be re-enabled.
const fastPathDisabledTimeout = uint64(200 * 1000 * 1000) // 100ms for a 2GHz CPU.

// nrMaxAwakeStubThreads is the maximum number of awake stub threads over all
// subprocesses at this moment.
var nrMaxAwakeStubThreads atomic.Uint32

// stubFastPathEnabled returns true if the fast path in stub processes is
// enabled. If the fast path is disabled, it checks whether it has to be
// re-enabled.
func (q *fastPathDispatcher) stubFastPathEnabled() bool {
	ts := q.fastPathDisabledTS.Load()
	if ts != 0 {
		if uint64(cputicks())-ts < fastPathDisabledTimeout {
			return false
		}
		if nrMaxAwakeStubThreads.Load() > fastPathContextLimit {
			q.fastPathDisabledTS.Store(uint64(cputicks()))
			return false
		}
		q.fastPathDisabledTS.Store(0)
	}
	return true
}

// disableStubFastPath disables the fast path over all subprocesses with active
// contexts.
func (q *fastPathDispatcher) disableStubFastPath() {
	q.fastPathDisabledTS.Store(uint64(cputicks()))
}

// deepSleepTimeout is the timeout after which we stop polling and fall asleep.
//
// The value is 40µs for a 2GHz CPU (80,000 cycles / 2e9 Hz). This timeout
// matches the sentry<->stub round trip in the pure deep sleep case.
const deepSleepTimeout = uint64(80000)
const handshakeTimeout = uint64(1000)

// loop processes contexts in the queue. Only one instance of it can be
// running, because it has exclusive access to the list.
//
// target is the context associated with the current go-routine.
func (q *fastPathDispatcher) loop(target *sharedContext) {
	done := false
	processed := 0
	slowPath := false
	start := cputicks()
	for {
		var ctx, next *sharedContext

		q.mu.Lock()
		if processed != 0 || !q.entrants.Empty() {
			start = cputicks()
			slowPath = false
		}
		q.nr -= processed
		// Add new contexts to the list.
		q.list.PushBackList(&q.entrants)
		ctx = q.list.Front()
		q.mu.Unlock()

		if done {
			if ctx != nil {
				// Wake up the next go-routine to run the loop.
				ctx.sync.Receiver().Notify(sharedContextDispatch)
			}
			break
		}

		processed = 0
		now := cputicks()
		for ctx = q.list.Front(); ctx != nil; ctx = next {
			next = ctx.Next()

			event := sharedContextReady
			if ctx.state() == sysmsg.ContextStateNone {
				if slowPath {
					event = sharedContextSlowPath
				} else if !ctx.kicked && uint64(now-ctx.startWaitingTS) > handshakeTimeout {
					if ctx.isAcked() {
						ctx.kicked = true
						continue
					}
					event = sharedContextKicked
				} else {
					continue
				}
			}
			processed++
			q.list.Remove(ctx)
			if ctx == target {
				done = true
			}
			ctx.sync.Receiver().Notify(event)
		}
		if processed == 0 {
			if uint64(cputicks()-start) > deepSleepTimeout {
				slowPath = true
				// Do one more run to notify all contexts.
				// q.list has to be empty at the end.
				continue
			}
			yield()
		}
	}
}

func (q *fastPathDispatcher) waitFor(ctx *sharedContext) syncevent.Set {
	events := syncevent.Set(0)

	q.mu.Lock()
	q.entrants.PushBack(ctx)
	q.nr++
	if q.nr == 1 {
		events = sharedContextDispatch
	}
	q.mu.Unlock()

	for {
		if events&sharedContextDispatch != 0 {
			ctx.sync.Ack(sharedContextDispatch)
			q.loop(ctx)
		}
		events = ctx.sync.WaitAndAckAll()
		if events&sharedContextDispatch == 0 {
			break
		}
	}
	return events
}
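
// waitFor blocks the context goroutine until the dispatcher hands it an event
// for its own context; the first waiter (q.nr == 1) self-elects as the
// dispatcher and runs loop(). A hedged sketch of a plausible caller of waitFor
// above (hypothetical; the real call sites live elsewhere in the systrap
// package):
//
//	events := dispatcher.waitFor(sc)
//	if events&sharedContextSlowPath != 0 {
//		// The dispatcher gave up polling; wait on the state futex instead.
//		sc.sleepOnState(sysmsg.ContextStateNone)
//	}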