gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/shared_context.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package systrap 16 17 import ( 18 "fmt" 19 "strconv" 20 "sync" 21 "sync/atomic" 22 "time" 23 24 "golang.org/x/sys/unix" 25 "gvisor.dev/gvisor/pkg/log" 26 "gvisor.dev/gvisor/pkg/sentry/platform" 27 "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" 28 "gvisor.dev/gvisor/pkg/syncevent" 29 ) 30 31 const ( 32 ackReset uint64 = 0 33 stateChangedReset uint64 = 0 34 ) 35 36 // sharedContext is an abstraction for interactions that the sentry has to 37 // perform with memory shared between it and the stub threads used for contexts. 38 // 39 // Any access to shared memory should most likely have a getter/setter through 40 // this struct. This is due to the following reasons: 41 // - The memory needs to be read or modified atomically because there is no 42 // (trusted) synchronization between the sentry and the stub processes. 43 // - Data read from shared memory may require validation before it can be used. 44 type sharedContext struct { 45 contextEntry 46 47 // subprocess is the subprocess that this sharedContext instance belongs to. 48 subprocess *subprocess 49 // contextID is the ID corresponding to the sysmsg.ThreadContext memory slot 50 // that is used for this sharedContext. 51 contextID uint32 52 // shared is the handle to the shared memory that the sentry task go-routine 53 // reads from and writes to. 54 // NOTE: Using this handle directly without a getter from this function should 55 // most likely be avoided due to concerns listed above. 56 shared *sysmsg.ThreadContext 57 58 // sync is used by the context go-routine to wait for events from the 59 // dispatcher. 60 sync syncevent.Waiter 61 startWaitingTS int64 62 kicked bool 63 // The task associated with the context fell asleep. 64 sleeping bool 65 } 66 67 // String returns the ID of this shared context. 68 func (sc *sharedContext) String() string { 69 return strconv.Itoa(int(sc.contextID)) 70 } 71 72 const ( 73 // sharedContextReady indicates that a context has new events. 74 sharedContextReady = syncevent.Set(1 << iota) 75 // sharedContextKicked indicates that a new stub thread should be woken up. 76 sharedContextKicked 77 // sharedContextSlowPath indicates that a context has to be waited for in the 78 // slow path. 79 sharedContextSlowPath 80 // sharedContextDispatch indicates that a context go-routine has to start the wait loop. 81 sharedContextDispatch 82 ) 83 84 func (s *subprocess) getSharedContext() (*sharedContext, error) { 85 s.mu.Lock() 86 defer s.mu.Unlock() 87 88 id, ok := s.threadContextPool.Get() 89 if !ok { 90 return nil, fmt.Errorf("subprocess has too many active tasks (%d); failed to create a new one", maxGuestContexts) 91 } 92 s.IncRef() 93 sc := sharedContext{ 94 subprocess: s, 95 contextID: uint32(id), 96 shared: s.getThreadContextFromID(id), 97 } 98 sc.shared.Init(invalidThreadID) 99 sc.sync.Init() 100 sc.sleeping = true 101 102 return &sc, nil 103 } 104 105 func (sc *sharedContext) release() { 106 if sc == nil { 107 return 108 } 109 if !sc.sleeping { 110 sc.subprocess.decAwakeContexts() 111 } 112 sc.subprocess.threadContextPool.Put(uint64(sc.contextID)) 113 sc.subprocess.DecRef(sc.subprocess.release) 114 } 115 116 func (sc *sharedContext) isActiveInSubprocess(s *subprocess) bool { 117 if sc == nil { 118 return false 119 } 120 return sc.subprocess == s 121 } 122 123 // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. 124 func (sc *sharedContext) NotifyInterrupt() { 125 // If this context is not being worked on right now we need to mark it as 126 // interrupted so the next executor does not start working on it. 127 atomic.StoreUint32(&sc.shared.Interrupt, 1) 128 if sc.threadID() == invalidThreadID { 129 return 130 } 131 sc.subprocess.sysmsgThreadsMu.Lock() 132 defer sc.subprocess.sysmsgThreadsMu.Unlock() 133 134 threadID := atomic.LoadUint32(&sc.shared.ThreadID) 135 sysmsgThread, ok := sc.subprocess.sysmsgThreads[threadID] 136 if !ok { 137 // This is either an invalidThreadID or another garbage value; either way we 138 // don't know which thread to interrupt; best we can do is mark the context. 139 return 140 } 141 142 t := sysmsgThread.thread 143 if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(platform.SignalInterrupt)); e != 0 { 144 panic(fmt.Sprintf("failed to interrupt the child process %d: %v", t.tid, e)) 145 } 146 } 147 148 func (sc *sharedContext) state() sysmsg.ContextState { 149 return sc.shared.State.Get() 150 } 151 152 func (sc *sharedContext) setState(state sysmsg.ContextState) { 153 sc.shared.State.Set(state) 154 } 155 156 func (sc *sharedContext) setInterrupt() { 157 atomic.StoreUint32(&sc.shared.Interrupt, 1) 158 } 159 160 func (sc *sharedContext) clearInterrupt() { 161 atomic.StoreUint32(&sc.shared.Interrupt, 0) 162 } 163 164 func (sc *sharedContext) setFPStateChanged() { 165 atomic.StoreUint64(&sc.shared.FPStateChanged, 1) 166 } 167 168 func (sc *sharedContext) threadID() uint32 { 169 return atomic.LoadUint32(&sc.shared.ThreadID) 170 } 171 172 // EnableSentryFastPath indicates that the polling mode is enabled for the 173 // Sentry. It has to be called before putting the context into the context queue. 174 func (sc *sharedContext) enableSentryFastPath() { 175 atomic.StoreUint32(&sc.shared.SentryFastPath, 1) 176 } 177 178 // DisableSentryFastPath indicates that the polling mode for the sentry is 179 // disabled for the Sentry. 180 func (sc *sharedContext) disableSentryFastPath() { 181 atomic.StoreUint32(&sc.shared.SentryFastPath, 0) 182 } 183 184 func (sc *sharedContext) isAcked() bool { 185 return atomic.LoadUint64(&sc.shared.AckedTime) != ackReset 186 } 187 188 // getAckedTimeDiff returns the time difference between when this context was 189 // put into the context queue, and when this context was acked by a stub thread. 190 // Precondition: must be called after isAcked() == true. 191 // 192 //go:nosplit 193 func (sc *sharedContext) getAckedTimeDiff() cpuTicks { 194 ackedAt := atomic.LoadUint64(&sc.shared.AckedTime) 195 if ackedAt < uint64(sc.startWaitingTS) { 196 log.Infof("likely memory tampering detected: found a condition where ackedAt (%d) < startWaitingTS (%d)", ackedAt, uint64(sc.startWaitingTS)) 197 return 0 198 } 199 return cpuTicks(ackedAt - uint64(sc.startWaitingTS)) 200 } 201 202 // getStateChangedTimeDiff returns the time difference between the time the 203 // context state got changed by a stub thread, and now. 204 // 205 //go:nosplit 206 func (sc *sharedContext) getStateChangedTimeDiff() cpuTicks { 207 changedAt := atomic.LoadUint64(&sc.shared.StateChangedTime) 208 now := uint64(cputicks()) 209 if now < changedAt { 210 log.Infof("likely memory tampering detected: found a condition where now (%d) < changedAt (%d)", now, changedAt) 211 return 0 212 } 213 return cpuTicks(now - changedAt) 214 } 215 216 func (sc *sharedContext) resetLatencyMeasures() { 217 atomic.StoreUint64(&sc.shared.AckedTime, ackReset) 218 atomic.StoreUint64(&sc.shared.StateChangedTime, stateChangedReset) 219 } 220 221 const ( 222 contextPreemptTimeoutNsec = 10 * 1000 * 1000 // 10ms 223 contextCheckupTimeoutSec = 5 224 stuckContextTimeout = 30 * time.Second 225 ) 226 227 var errDeadSubprocess = fmt.Errorf("subprocess died") 228 229 func (sc *sharedContext) sleepOnState(state sysmsg.ContextState) error { 230 timeout := unix.Timespec{ 231 Sec: 0, 232 Nsec: contextPreemptTimeoutNsec, 233 } 234 sentInterruptOnce := false 235 deadline := time.Now().Add(stuckContextTimeout) 236 for sc.state() == state { 237 errno := sc.shared.SleepOnState(state, &timeout) 238 if errno == 0 { 239 continue 240 } 241 if errno != unix.ETIMEDOUT { 242 panic(fmt.Sprintf("error waiting for state: %v", errno)) 243 } 244 if !sc.subprocess.alive() { 245 return errDeadSubprocess 246 } 247 if time.Now().After(deadline) { 248 log.Warningf("Systrap task goroutine has been waiting on ThreadContext.State futex too long. ThreadContext: %v", sc) 249 } 250 if sentInterruptOnce { 251 log.Warningf("The context is still running: %v", sc) 252 continue 253 } 254 255 if !sc.isAcked() || sc.subprocess.contextQueue.isEmpty() { 256 continue 257 } 258 sc.NotifyInterrupt() 259 sentInterruptOnce = true 260 timeout.Sec = contextCheckupTimeoutSec 261 timeout.Nsec = 0 262 } 263 return nil 264 } 265 266 type fastPathDispatcher struct { 267 // list is used only from the loop method and so it isn't protected by 268 // any lock. 269 list contextList 270 271 mu sync.Mutex 272 273 // nr is the number of contexts in the queue. 274 // +checklocks:mu 275 nr int 276 277 // entrants contains new contexts that haven't been added to `list` yet. 278 // +checklocks:mu 279 entrants contextList 280 } 281 282 var dispatcher fastPathDispatcher 283 284 const ( 285 // deepSleepTimeout is the timeout after which both stub threads and the 286 // dispatcher consider whether to stop polling. They need to have elapsed 287 // this timeout twice in a row in order to stop, so the actual timeout 288 // can be considered to be (deepSleepTimeout*2). Falling asleep after two 289 // shorter timeouts instead of one long timeout is done in order to 290 // mitigate the effects of rdtsc inaccuracies. 291 // 292 // The value is 20µs for 2GHz CPU. 40µs matches the sentry<->stub 293 // round trip in the pure deep sleep case. 294 deepSleepTimeout = uint64(40000) 295 handshakeTimeout = uint64(1000) 296 ) 297 298 // loop is processing contexts in the queue. Only one instance of it can be 299 // running, because it has exclusive access to the list. 300 // 301 // target is the context associated with the current go-routine. 302 func (q *fastPathDispatcher) loop(target *sharedContext) { 303 done := false 304 processed := 0 305 firstTimeout := false 306 slowPath := false 307 startedSpinning := cputicks() 308 for { 309 var ctx, next *sharedContext 310 311 q.mu.Lock() 312 q.nr -= processed 313 // Add new contexts to the list. 314 q.list.PushBackList(&q.entrants) 315 ctx = q.list.Front() 316 q.mu.Unlock() 317 318 if done { 319 if ctx != nil { 320 // Wake up the next go-routine to run the loop. 321 ctx.sync.Receiver().Notify(sharedContextDispatch) 322 } 323 break 324 } 325 326 slowPath = !fastpath.sentryFastPath() || slowPath 327 processed = 0 328 now := cputicks() 329 for ctx = q.list.Front(); ctx != nil; ctx = next { 330 next = ctx.Next() 331 332 event := sharedContextReady 333 if ctx.state() == sysmsg.ContextStateNone { 334 if slowPath { 335 event = sharedContextSlowPath 336 } else if !ctx.kicked && uint64(now-ctx.startWaitingTS) > handshakeTimeout { 337 if ctx.isAcked() { 338 ctx.kicked = true 339 continue 340 } 341 event = sharedContextKicked 342 } else { 343 continue 344 } 345 } 346 processed++ 347 q.list.Remove(ctx) 348 if ctx == target { 349 done = true 350 } 351 ctx.sync.Receiver().Notify(event) 352 } 353 354 if processed != 0 { 355 startedSpinning = now 356 firstTimeout = false 357 } else { 358 fastpath.usedSentryFastPath.Store(true) 359 } 360 // If dispatcher has been spinning for too long, send this 361 // dispatcher to sleep. 362 if uint64(now-startedSpinning) > deepSleepTimeout { 363 slowPath = firstTimeout 364 firstTimeout = true 365 } 366 367 yield() 368 } 369 } 370 371 func (q *fastPathDispatcher) waitFor(ctx *sharedContext) syncevent.Set { 372 events := syncevent.NoEvents 373 374 q.mu.Lock() 375 q.entrants.PushBack(ctx) 376 q.nr++ 377 if q.nr == 1 { 378 events = sharedContextDispatch 379 } 380 q.mu.Unlock() 381 382 for { 383 if events&sharedContextDispatch != 0 { 384 ctx.sync.Ack(sharedContextDispatch) 385 q.loop(ctx) 386 } 387 events = ctx.sync.WaitAndAckAll() 388 if events&sharedContextDispatch == 0 { 389 break 390 } 391 } 392 return events 393 }