github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/systrap/shared_context.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/systrap/shared_context.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package systrap
    16  
    17  import (
    18  	"fmt"
    19  	"runtime"
    20  	"strconv"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"golang.org/x/sys/unix"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform/systrap/sysmsg"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/syncevent"
    30  )
    31  
    32  const (
    33  	ackReset uint32 = 0
    34  )
    35  
// sharedContext is an abstraction for interactions that the sentry has to
// perform with memory shared between it and the stub threads used for contexts.
//
// Any access to shared memory should most likely have a getter/setter through
// this struct. This is due to the following reasons:
//   - The memory needs to be read or modified atomically because there is no
//     (trusted) synchronization between the sentry and the stub processes.
//   - Data read from shared memory may require validation before it can be used.
type sharedContext struct {
	// contextEntry is embedded so that a sharedContext can be linked into a
	// contextList (fastPathDispatcher walks such a list via Next()).
	// NOTE(review): contextEntry is declared outside this file — confirm.
	contextEntry

	// subprocess is the subprocess that this sharedContext instance belongs to.
	subprocess *subprocess
	// contextID is the ID corresponding to the sysmsg.ThreadContext memory slot
	// that is used for this sharedContext.
	contextID uint32
	// shared is the handle to the shared memory that the sentry task go-routine
	// reads from and writes to.
	// NOTE: Using this handle directly without a getter from this function should
	//       most likely be avoided due to concerns listed above.
	shared *sysmsg.ThreadContext

	// sync is used by the context go-routine to wait for events from the
	// dispatcher.
	//
	// startWaitingTS is compared against cputicks() by fastPathDispatcher.loop
	// to decide whether handshakeTimeout has elapsed for this context; it is
	// written outside this file.
	//
	// kicked is set by fastPathDispatcher.loop once the context is found to be
	// acked after handshakeTimeout has elapsed; while it is set the loop no
	// longer performs the handshake check for this context.
	sync           syncevent.Waiter
	startWaitingTS int64
	kicked         bool
	// The task associated with the context fell asleep.
	// It starts out true in getSharedContext; release calls
	// subprocess.decAwakeContexts when it is false.
	sleeping bool
}
    66  
    67  // String returns the ID of this shared context.
    68  func (sc *sharedContext) String() string {
    69  	return strconv.Itoa(int(sc.contextID))
    70  }
    71  
    72  const (
    73  	// sharedContextReady indicates that a context has new events.
    74  	sharedContextReady = syncevent.Set(1 << iota)
    75  	// sharedContextKicked indicates that a new stub thread should be woken up.
    76  	sharedContextKicked
    77  	// sharedContextSlowPath indicates that a context has to be waited for in the
    78  	// slow path.
    79  	sharedContextSlowPath
    80  	// sharedContextDispatch indicates that a context go-routine has to start the wait loop.
    81  	sharedContextDispatch
    82  )
    83  
    84  func (s *subprocess) getSharedContext() (*sharedContext, error) {
    85  	s.mu.Lock()
    86  	defer s.mu.Unlock()
    87  
    88  	id, ok := s.threadContextPool.Get()
    89  	if !ok {
    90  		return nil, fmt.Errorf("subprocess has too many active tasks (%d); failed to create a new one", maxGuestContexts)
    91  	}
    92  	s.IncRef()
    93  	sc := sharedContext{
    94  		subprocess: s,
    95  		contextID:  uint32(id),
    96  		shared:     s.getThreadContextFromID(id),
    97  	}
    98  	sc.shared.Init(invalidThreadID)
    99  	sc.sync.Init()
   100  	sc.sleeping = true
   101  
   102  	return &sc, nil
   103  }
   104  
   105  func (sc *sharedContext) release() {
   106  	if sc == nil {
   107  		return
   108  	}
   109  	if !sc.sleeping {
   110  		sc.subprocess.decAwakeContexts()
   111  
   112  	}
   113  	sc.subprocess.threadContextPool.Put(uint64(sc.contextID))
   114  	sc.subprocess.DecRef(sc.subprocess.release)
   115  }
   116  
   117  func (sc *sharedContext) isActiveInSubprocess(s *subprocess) bool {
   118  	if sc == nil {
   119  		return false
   120  	}
   121  	return sc.subprocess == s
   122  }
   123  
   124  // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
   125  func (sc *sharedContext) NotifyInterrupt() {
   126  	// If this context is not being worked on right now we need to mark it as
   127  	// interrupted so the next executor does not start working on it.
   128  	atomic.StoreUint32(&sc.shared.Interrupt, 1)
   129  	if sc.threadID() == invalidThreadID {
   130  		return
   131  	}
   132  	sc.subprocess.sysmsgThreadsMu.Lock()
   133  	defer sc.subprocess.sysmsgThreadsMu.Unlock()
   134  
   135  	threadID := atomic.LoadUint32(&sc.shared.ThreadID)
   136  	sysmsgThread, ok := sc.subprocess.sysmsgThreads[threadID]
   137  	if !ok {
   138  		// This is either an invalidThreadID or another garbage value; either way we
   139  		// don't know which thread to interrupt; best we can do is mark the context.
   140  		return
   141  	}
   142  
   143  	t := sysmsgThread.thread
   144  	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(platform.SignalInterrupt)); e != 0 {
   145  		panic(fmt.Sprintf("failed to interrupt the child process %d: %v", t.tid, e))
   146  	}
   147  }
   148  
   149  func (sc *sharedContext) state() sysmsg.ContextState {
   150  	return sc.shared.State.Get()
   151  }
   152  
   153  func (sc *sharedContext) setState(state sysmsg.ContextState) {
   154  	sc.shared.State.Set(state)
   155  }
   156  
   157  func (sc *sharedContext) setInterrupt() {
   158  	atomic.StoreUint32(&sc.shared.Interrupt, 1)
   159  }
   160  
   161  func (sc *sharedContext) clearInterrupt() {
   162  	atomic.StoreUint32(&sc.shared.Interrupt, 0)
   163  }
   164  
   165  func (sc *sharedContext) setFPStateChanged() {
   166  	atomic.StoreUint64(&sc.shared.FPStateChanged, 1)
   167  }
   168  
   169  func (sc *sharedContext) threadID() uint32 {
   170  	return atomic.LoadUint32(&sc.shared.ThreadID)
   171  }
   172  
   173  // EnableSentryFastPath indicates that the polling mode is enabled for the
   174  // Sentry. It has to be called before putting the context into the context queue.
   175  // This function is used if contextDecouplingExp=true because the fastpath
   176  // is negotiated in ThreadContext.
   177  func (sc *sharedContext) enableSentryFastPath() {
   178  	atomic.StoreUint32(&sc.shared.SentryFastPath, 1)
   179  }
   180  
   181  // DisableSentryFastPath indicates that the polling mode for the sentry is
   182  // disabled for the Sentry.
   183  // This function is used if contextDecouplingExp=true because the fastpath
   184  // is negotiated in ThreadContext.
   185  func (sc *sharedContext) disableSentryFastPath() {
   186  	atomic.StoreUint32(&sc.shared.SentryFastPath, 0)
   187  }
   188  
   189  func (sc *sharedContext) isAcked() bool {
   190  	return atomic.LoadUint32(&sc.shared.Acked) != ackReset
   191  }
   192  
   193  func (sc *sharedContext) resetAcked() {
   194  	atomic.StoreUint32(&sc.shared.Acked, ackReset)
   195  }
   196  
   197  const (
   198  	contextPreemptTimeoutNsec = 10 * 1000 * 1000 // 10ms
   199  	contextCheckupTimeoutSec  = 5
   200  	stuckContextTimeout       = 30 * time.Second
   201  )
   202  
   203  func (sc *sharedContext) sleepOnState(state sysmsg.ContextState) {
   204  	timeout := unix.Timespec{
   205  		Sec:  0,
   206  		Nsec: contextPreemptTimeoutNsec,
   207  	}
   208  	sentInterruptOnce := false
   209  	deadline := time.Now().Add(stuckContextTimeout)
   210  	for sc.state() == state {
   211  		errno := sc.shared.SleepOnState(state, &timeout)
   212  		if errno == 0 {
   213  			continue
   214  		}
   215  		if errno != unix.ETIMEDOUT {
   216  			panic(fmt.Sprintf("error waiting for state: %v", errno))
   217  		}
   218  		if time.Now().After(deadline) {
   219  			log.Warningf("Systrap task goroutine has been waiting on ThreadContext.State futex too long. ThreadContext: %v", sc)
   220  		}
   221  		if sentInterruptOnce {
   222  			log.Warningf("The context is still running: %v", sc)
   223  			continue
   224  		}
   225  
   226  		if !sc.isAcked() || sc.subprocess.contextQueue.isEmpty() {
   227  			continue
   228  		}
   229  		sc.NotifyInterrupt()
   230  		sentInterruptOnce = true
   231  		timeout.Sec = contextCheckupTimeoutSec
   232  		timeout.Nsec = 0
   233  	}
   234  }
   235  
// fastPathDispatcher polls a set of shared contexts on behalf of their task
// go-routines. Contexts are handed over through entrants (see waitFor) and
// polled by a single running instance of loop.
type fastPathDispatcher struct {
	// list is used only from the loop method and so it isn't protected by
	// any lock.
	list contextList

	// mu protects nr and entrants.
	mu sync.Mutex

	// nr is the number of contexts in the queue.
	// +checklocks:mu
	nr int

	// entrants contains new contexts that haven't been added to `list` yet.
	// +checklocks:mu
	entrants contextList

	// fastPathDisabledTS is the time stamp when the stub fast path was
	// disabled. It is zero if the fast path is enabled.
	// The time stamp is taken from cputicks().
	fastPathDisabledTS atomic.Uint64
}

// dispatcher is the package-level fastPathDispatcher instance.
var dispatcher fastPathDispatcher
   257  
   258  // fastPathContextLimit is the maximum number of contexts after which the fast
   259  // path in stub threads is disabled. Its value can be higher than the number of
   260  // CPU-s, because the Sentry is running with higher priority than stub threads,
   261  // deepSleepTimeout is much shorter than the Linux scheduler timeslice, so the
   262  // only thing that matters here is whether the Sentry handles syscall faster
   263  // than the overhead of scheduling another stub thread.
   264  var fastPathContextLimit = uint32(runtime.GOMAXPROCS(0) * 2)
   265  
   266  // fastPathDisabledTimeout is the timeout after which the fast path in stub
   267  // processes will be re-enabled.
   268  const fastPathDisabledTimeout = uint64(200 * 1000 * 1000) // 100ms for 2GHz.
   269  
   270  // nrMaxAwakeStubThreads is the maximum number of awake stub threads over all
   271  // subprocesses at the this moment.
   272  var nrMaxAwakeStubThreads atomic.Uint32
   273  
   274  // stubFastPathEnabled returns true if the fast path in stub processes is
   275  // enabled. If the fast path is disabled, it revises whether it has to be
   276  // re-enabled or not.
   277  func (q *fastPathDispatcher) stubFastPathEnabled() bool {
   278  	ts := q.fastPathDisabledTS.Load()
   279  	if ts != 0 {
   280  		if uint64(cputicks())-ts < fastPathDisabledTimeout {
   281  			return false
   282  		}
   283  		if nrMaxAwakeStubThreads.Load() > fastPathContextLimit {
   284  			q.fastPathDisabledTS.Store(uint64(cputicks()))
   285  			return false
   286  		}
   287  		q.fastPathDisabledTS.Store(0)
   288  	}
   289  	return true
   290  }
   291  
   292  // disableStubFastPath disables the fast path over all subprocesses with active
   293  // contexts.
   294  func (q *fastPathDispatcher) disableStubFastPath() {
   295  	q.fastPathDisabledTS.Store(uint64(cputicks()))
   296  }
   297  
   298  // deep_sleep_timeout is the timeout after which we stops polling and fall asleep.
   299  //
   300  // The value is 40µs for 2GHz CPU. This timeout matches the sentry<->stub round
   301  // trip in the pure deep sleep case.
   302  const deepSleepTimeout = uint64(80000)
   303  const handshakeTimeout = uint64(1000)
   304  
// loop is processing contexts in the queue. Only one instance of it can be
// running, because it has exclusive access to the list.
//
// target is the context associated with the current go-routine.
//
// Each pass moves q.entrants onto q.list and then walks the list. A context
// whose state is no longer sysmsg.ContextStateNone is removed and notified
// with sharedContextReady. A context still in ContextStateNone stays on the
// list, unless the loop is in slow-path mode (it is removed and notified with
// sharedContextSlowPath) or it has waited longer than handshakeTimeout
// without being acked (it is removed and notified with sharedContextKicked).
//
// The loop finishes on the pass after target itself has been notified. If
// contexts are still on the list at that point, the first one is notified
// with sharedContextDispatch so that its go-routine runs the loop next.
func (q *fastPathDispatcher) loop(target *sharedContext) {
	done := false
	processed := 0
	slowPath := false
	start := cputicks()
	for {
		var ctx, next *sharedContext

		q.mu.Lock()
		// Any progress (contexts handled in the previous pass, or new
		// entrants) restarts the deep-sleep timer and leaves slow-path mode.
		if processed != 0 || !q.entrants.Empty() {
			start = cputicks()
			slowPath = false
		}
		// Account for the contexts removed from the list in the previous pass.
		q.nr -= processed
		// Add new contexts to the list.
		q.list.PushBackList(&q.entrants)
		ctx = q.list.Front()
		q.mu.Unlock()

		if done {
			if ctx != nil {
				// Wake up the next go-routine to run the loop.
				ctx.sync.Receiver().Notify(sharedContextDispatch)
			}
			break
		}

		processed = 0
		now := cputicks()
		for ctx = q.list.Front(); ctx != nil; ctx = next {
			// Fetch the successor first: ctx may be removed from the list
			// below.
			next = ctx.Next()

			event := sharedContextReady
			if ctx.state() == sysmsg.ContextStateNone {
				if slowPath {
					event = sharedContextSlowPath
				} else if !ctx.kicked && uint64(now-ctx.startWaitingTS) > handshakeTimeout {
					if ctx.isAcked() {
						// The context has been acked; remember that so the
						// handshake check is skipped from now on, and keep
						// polling it.
						ctx.kicked = true
						continue
					}
					event = sharedContextKicked
				} else {
					// Nothing to report yet; keep the context on the list.
					continue
				}
			}
			processed++
			q.list.Remove(ctx)
			if ctx == target {
				done = true
			}
			ctx.sync.Receiver().Notify(event)
		}
		if processed == 0 {
			if uint64(cputicks()-start) > deepSleepTimeout {
				slowPath = true
				// Do one more run to notify all contexts.
				// q.list has to be empty at the end.
				continue
			}
			// NOTE(review): yield is defined elsewhere; presumably it gives
			// up the CPU briefly before the next polling pass — confirm.
			yield()
		}
	}
}
   373  
   374  func (q *fastPathDispatcher) waitFor(ctx *sharedContext) syncevent.Set {
   375  	events := syncevent.Set(0)
   376  
   377  	q.mu.Lock()
   378  	q.entrants.PushBack(ctx)
   379  	q.nr++
   380  	if q.nr == 1 {
   381  		events = sharedContextDispatch
   382  	}
   383  	q.mu.Unlock()
   384  
   385  	for {
   386  		if events&sharedContextDispatch != 0 {
   387  			ctx.sync.Ack(sharedContextDispatch)
   388  			q.loop(ctx)
   389  		}
   390  		events = ctx.sync.WaitAndAckAll()
   391  		if events&sharedContextDispatch == 0 {
   392  			break
   393  		}
   394  	}
   395  	return events
   396  }