github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/systrap/shared_context.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package systrap

import (
	"fmt"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"github.com/metacubex/gvisor/pkg/syncevent"
	"golang.org/x/sys/unix"
)

const (
	ackReset          uint64 = 0
	stateChangedReset uint64 = 0
)

// sharedContext is an abstraction for interactions that the sentry has to
// perform with memory shared between it and the stub threads used for contexts.
//
// Any access to shared memory should most likely have a getter/setter through
// this struct. This is due to the following reasons:
//   - The memory needs to be read or modified atomically because there is no
//     (trusted) synchronization between the sentry and the stub processes.
//   - Data read from shared memory may require validation before it can be used.
type sharedContext struct {
	contextEntry

	// subprocess is the subprocess that this sharedContext instance belongs to.
	subprocess *subprocess
	// contextID is the ID corresponding to the sysmsg.ThreadContext memory slot
	// that is used for this sharedContext.
	contextID uint32
	// shared is the handle to the shared memory that the sentry task go-routine
	// reads from and writes to.
	// NOTE: Using this handle directly without a getter/setter on this struct
	//       should most likely be avoided due to the concerns listed above.
	shared *sysmsg.ThreadContext

	// sync is used by the context go-routine to wait for events from the
	// dispatcher.
	sync           syncevent.Waiter
	startWaitingTS int64
	kicked         bool
	// sleeping is true if the task associated with the context fell asleep.
	sleeping bool
}

// String returns the ID of this shared context.
func (sc *sharedContext) String() string {
	return strconv.Itoa(int(sc.contextID))
}

const (
	// sharedContextReady indicates that a context has new events.
	sharedContextReady = syncevent.Set(1 << iota)
	// sharedContextKicked indicates that a new stub thread should be woken up.
	sharedContextKicked
	// sharedContextSlowPath indicates that a context has to be waited for in the
	// slow path.
	sharedContextSlowPath
	// sharedContextDispatch indicates that a context go-routine has to start the
	// wait loop.
	sharedContextDispatch
)

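// getSharedContext allocates a free sysmsg.ThreadContext slot from the
// subprocess's thread context pool and wraps it in a sharedContext. It takes
// a reference on the subprocess, which is dropped in release().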
func (s *subprocess) getSharedContext() (*sharedContext, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	id, ok := s.threadContextPool.Get()
	if !ok {
		return nil, fmt.Errorf("subprocess has too many active tasks (%d); failed to create a new one", maxGuestContexts)
	}
	s.IncRef()
	sc := sharedContext{
		subprocess: s,
		contextID:  uint32(id),
		shared:     s.getThreadContextFromID(id),
	}
	sc.shared.Init(invalidThreadID)
	sc.sync.Init()
	sc.sleeping = true

	return &sc, nil
}

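// release returns the context's memory slot to the thread context pool and
// drops the reference on the owning subprocess. It is a no-op on a nil
// sharedContext.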
func (sc *sharedContext) release() {
	if sc == nil {
		return
	}
	if !sc.sleeping {
		sc.subprocess.decAwakeContexts()
	}
	sc.subprocess.threadContextPool.Put(uint64(sc.contextID))
	sc.subprocess.DecRef(sc.subprocess.release)
}

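// isActiveInSubprocess returns true if this context belongs to the given
// subprocess.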
func (sc *sharedContext) isActiveInSubprocess(s *subprocess) bool {
	if sc == nil {
		return false
	}
	return sc.subprocess == s
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (sc *sharedContext) NotifyInterrupt() {
	// If this context is not being worked on right now, we need to mark it as
	// interrupted so the next executor does not start working on it.
	atomic.StoreUint32(&sc.shared.Interrupt, 1)
	if sc.threadID() == invalidThreadID {
		return
	}
	sc.subprocess.sysmsgThreadsMu.Lock()
	defer sc.subprocess.sysmsgThreadsMu.Unlock()

	threadID := atomic.LoadUint32(&sc.shared.ThreadID)
	sysmsgThread, ok := sc.subprocess.sysmsgThreads[threadID]
	if !ok {
		// This is either an invalidThreadID or another garbage value; either way,
		// we don't know which thread to interrupt, so the best we can do is mark
		// the context as interrupted.
		return
	}

	t := sysmsgThread.thread
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(platform.SignalInterrupt)); e != 0 {
		panic(fmt.Sprintf("failed to interrupt the child process %d: %v", t.tid, e))
	}
}

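// state returns the current context state stored in shared memory.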
func (sc *sharedContext) state() sysmsg.ContextState {
	return sc.shared.State.Get()
}

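// setState writes the given context state to shared memory.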
func (sc *sharedContext) setState(state sysmsg.ContextState) {
	sc.shared.State.Set(state)
}

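// setInterrupt sets the interrupt flag in shared memory.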
func (sc *sharedContext) setInterrupt() {
	atomic.StoreUint32(&sc.shared.Interrupt, 1)
}

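// clearInterrupt clears the interrupt flag in shared memory.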
func (sc *sharedContext) clearInterrupt() {
	atomic.StoreUint32(&sc.shared.Interrupt, 0)
}

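// setFPStateChanged marks the context's floating-point state as changed in
// shared memory.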
func (sc *sharedContext) setFPStateChanged() {
	atomic.StoreUint64(&sc.shared.FPStateChanged, 1)
}

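// threadID returns the ID of the stub thread recorded in shared memory.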
func (sc *sharedContext) threadID() uint32 {
	return atomic.LoadUint32(&sc.shared.ThreadID)
}

// enableSentryFastPath indicates that polling mode is enabled for the Sentry.
// It has to be called before putting the context into the context queue.
func (sc *sharedContext) enableSentryFastPath() {
	atomic.StoreUint32(&sc.shared.SentryFastPath, 1)
}

// disableSentryFastPath indicates that polling mode is disabled for the Sentry.
func (sc *sharedContext) disableSentryFastPath() {
	atomic.StoreUint32(&sc.shared.SentryFastPath, 0)
}

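// isAcked returns true if a stub thread has acknowledged this context, i.e.
// AckedTime has been set in shared memory.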
func (sc *sharedContext) isAcked() bool {
	return atomic.LoadUint64(&sc.shared.AckedTime) != ackReset
}

// getAckedTimeDiff returns the time difference between when this context was
// put into the context queue and when this context was acked by a stub thread.
// Precondition: must be called after isAcked() == true.
//
//go:nosplit
func (sc *sharedContext) getAckedTimeDiff() cpuTicks {
	ackedAt := atomic.LoadUint64(&sc.shared.AckedTime)
	if ackedAt < uint64(sc.startWaitingTS) {
		log.Infof("likely memory tampering detected: found a condition where ackedAt (%d) < startWaitingTS (%d)", ackedAt, uint64(sc.startWaitingTS))
		return 0
	}
	return cpuTicks(ackedAt - uint64(sc.startWaitingTS))
}

// getStateChangedTimeDiff returns the time elapsed since a stub thread last
// changed the context state.
//
//go:nosplit
func (sc *sharedContext) getStateChangedTimeDiff() cpuTicks {
	changedAt := atomic.LoadUint64(&sc.shared.StateChangedTime)
	now := uint64(cputicks())
	if now < changedAt {
		log.Infof("likely memory tampering detected: found a condition where now (%d) < changedAt (%d)", now, changedAt)
		return 0
	}
	return cpuTicks(now - changedAt)
}

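// resetLatencyMeasures resets the acked and state-changed timestamps in
// shared memory.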
func (sc *sharedContext) resetLatencyMeasures() {
	atomic.StoreUint64(&sc.shared.AckedTime, ackReset)
	atomic.StoreUint64(&sc.shared.StateChangedTime, stateChangedReset)
}

const (
	contextPreemptTimeoutNsec = 10 * 1000 * 1000 // 10ms
	contextCheckupTimeoutSec  = 5
	stuckContextTimeout       = 30 * time.Second
)

var errDeadSubprocess = fmt.Errorf("subprocess died")

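// sleepOnState blocks until the context state in shared memory changes away
// from the given state, or returns errDeadSubprocess if the subprocess dies.
// After the first preempt timeout, it interrupts the running stub thread once,
// provided the context has been acked and other contexts are waiting in the
// queue.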
func (sc *sharedContext) sleepOnState(state sysmsg.ContextState) error {
	timeout := unix.Timespec{
		Sec:  0,
		Nsec: contextPreemptTimeoutNsec,
	}
	sentInterruptOnce := false
	deadline := time.Now().Add(stuckContextTimeout)
	for sc.state() == state {
		errno := sc.shared.SleepOnState(state, &timeout)
		if errno == 0 {
			continue
		}
		if errno != unix.ETIMEDOUT {
			panic(fmt.Sprintf("error waiting for state: %v", errno))
		}
		if !sc.subprocess.alive() {
			return errDeadSubprocess
		}
		if time.Now().After(deadline) {
			log.Warningf("Systrap task goroutine has been waiting on ThreadContext.State futex too long. ThreadContext: %v", sc)
		}
		if sentInterruptOnce {
			log.Warningf("The context is still running: %v", sc)
			continue
		}

		if !sc.isAcked() || sc.subprocess.contextQueue.isEmpty() {
			continue
		}
		sc.NotifyInterrupt()
		sentInterruptOnce = true
		timeout.Sec = contextCheckupTimeoutSec
		timeout.Nsec = 0
	}
	return nil
}

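// fastPathDispatcher polls the shared state of contexts queued via waitFor
// and notifies the corresponding task go-routines when their contexts have
// events. Only one task go-routine runs the loop method at a time.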
type fastPathDispatcher struct {
	// list is used only from the loop method and so it isn't protected by
	// any lock.
	list contextList

	mu sync.Mutex

	// nr is the number of contexts in the queue.
	// +checklocks:mu
	nr int

	// entrants contains new contexts that haven't been added to `list` yet.
	// +checklocks:mu
	entrants contextList
}

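// dispatcher is the package-global fastPathDispatcher instance.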
var dispatcher fastPathDispatcher

const (
	// deepSleepTimeout is the timeout after which both stub threads and the
	// dispatcher consider whether to stop polling. They need to have elapsed
	// this timeout twice in a row in order to stop, so the actual timeout
	// can be considered to be (deepSleepTimeout*2). Falling asleep after two
	// shorter timeouts instead of one long timeout is done in order to
	// mitigate the effects of rdtsc inaccuracies.
	//
	// The value is 20µs for a 2GHz CPU. 40µs matches the sentry<->stub
	// round trip in the pure deep sleep case.
	deepSleepTimeout = uint64(40000)
	handshakeTimeout = uint64(1000)
)

// loop processes contexts in the queue. Only one instance of it can be
// running at a time, because it has exclusive access to the list.
//
// target is the context associated with the current go-routine.
func (q *fastPathDispatcher) loop(target *sharedContext) {
	done := false
	processed := 0
	firstTimeout := false
	slowPath := false
	startedSpinning := cputicks()
	for {
		var ctx, next *sharedContext

		q.mu.Lock()
		q.nr -= processed
		// Add new contexts to the list.
		q.list.PushBackList(&q.entrants)
		ctx = q.list.Front()
		q.mu.Unlock()

		if done {
			if ctx != nil {
				// Wake up the next go-routine to run the loop.
				ctx.sync.Receiver().Notify(sharedContextDispatch)
			}
			break
		}

		slowPath = !fastpath.sentryFastPath() || slowPath
		processed = 0
		now := cputicks()
		for ctx = q.list.Front(); ctx != nil; ctx = next {
			next = ctx.Next()

			event := sharedContextReady
			if ctx.state() == sysmsg.ContextStateNone {
				if slowPath {
					event = sharedContextSlowPath
				} else if !ctx.kicked && uint64(now-ctx.startWaitingTS) > handshakeTimeout {
					if ctx.isAcked() {
						ctx.kicked = true
						continue
					}
					event = sharedContextKicked
				} else {
					continue
				}
			}
			processed++
			q.list.Remove(ctx)
			if ctx == target {
				done = true
			}
			ctx.sync.Receiver().Notify(event)
		}

		if processed != 0 {
			startedSpinning = now
			firstTimeout = false
		} else {
			fastpath.usedSentryFastPath.Store(true)
		}
		// If the dispatcher has been spinning for too long, send this
		// dispatcher to sleep.
		if uint64(now-startedSpinning) > deepSleepTimeout {
			slowPath = firstTimeout
			firstTimeout = true
		}

		yield()
	}
}

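// waitFor queues the context on the dispatcher and blocks until an event for
// it arrives. The go-routine that enqueues into an empty queue takes on the
// dispatcher role and runs loop(); when loop() finishes, it hands the
// dispatcher role off to the go-routine of another queued context.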
func (q *fastPathDispatcher) waitFor(ctx *sharedContext) syncevent.Set {
	events := syncevent.NoEvents

	q.mu.Lock()
	q.entrants.PushBack(ctx)
	q.nr++
	if q.nr == 1 {
		events = sharedContextDispatch
	}
	q.mu.Unlock()

	for {
		if events&sharedContextDispatch != 0 {
			ctx.sync.Ack(sharedContextDispatch)
			q.loop(ctx)
		}
		events = ctx.sync.WaitAndAckAll()
		if events&sharedContextDispatch == 0 {
			break
		}
	}
	return events
}