github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/lock/lock.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package lock is the API for POSIX-style advisory regional file locks and 16 // BSD-style full file locks. 17 // 18 // Callers needing to enforce these types of locks, like sys_fcntl, can call 19 // LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are 20 // specific to a unique file (unique device/inode pair) and for this reason 21 // should not be shared between files. 22 // 23 // A Lock has a set of holders identified by UniqueID. Normally this is the 24 // pid of the thread attempting to acquire the lock. 25 // 26 // Since these are advisory locks, they do not need to be integrated into 27 // Reads/Writes and for this reason there is no way to *check* if a lock is 28 // held. One can only attempt to take a lock or unlock an existing lock. 29 // 30 // A Lock in a set of Locks is typed: it is either a read lock with any number 31 // of readers and no writer, or a write lock with no readers. 32 // 33 // As expected from POSIX, any attempt to acquire a write lock on a file region 34 // when there already exits a write lock held by a different uid will fail. Any 35 // attempt to acquire a write lock on a file region when there is more than one 36 // reader will fail. Any attempt to acquire a read lock on a file region when 37 // there is already a writer will fail. 38 // 39 // In special cases, a read lock may be upgraded to a write lock and a write lock 40 // can be downgraded to a read lock. This can only happen if: 41 // 42 // - read lock upgrade to write lock: There can be only one reader and the reader 43 // must be the same as the requested write lock holder. 44 // 45 // - write lock downgrade to read lock: The writer must be the same as the requested 46 // read lock holder. 47 // 48 // UnlockRegion always succeeds. If LockRegion fails the caller should normally 49 // interpret this as "try again later". 50 package lock 51 52 import ( 53 "fmt" 54 "math" 55 56 "golang.org/x/sys/unix" 57 "github.com/metacubex/gvisor/pkg/abi/linux" 58 "github.com/metacubex/gvisor/pkg/context" 59 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 60 "github.com/metacubex/gvisor/pkg/sync" 61 "github.com/metacubex/gvisor/pkg/waiter" 62 ) 63 64 // LockType is a type of regional file lock. 65 type LockType int 66 67 // UniqueID is a unique identifier of the holder of a regional file lock. 68 type UniqueID any 69 70 const ( 71 // ReadLock describes a POSIX regional file lock to be taken 72 // read only. There may be multiple of these locks on a single 73 // file region as long as there is no writer lock on the same 74 // region. 75 ReadLock LockType = iota 76 77 // WriteLock describes a POSIX regional file lock to be taken 78 // write only. There may be only a single holder of this lock 79 // and no read locks. 80 WriteLock 81 ) 82 83 // LockEOF is the maximal possible end of a regional file lock. 84 // 85 // A BSD-style full file lock can be represented as a regional file lock from 86 // offset 0 to LockEOF. 87 const LockEOF = math.MaxInt64 88 89 // OwnerInfo describes the owner of a lock. 90 // 91 // +stateify savable 92 type OwnerInfo struct { 93 // PID is the process ID of the lock owner. 94 PID int32 95 // OFD is whether this is an open file descriptor lock. 96 OFD bool 97 } 98 99 // Lock is a regional file lock. It consists of either a single writer 100 // or a set of readers. 101 // 102 // A Lock may be upgraded from a read lock to a write lock only if there 103 // is a single reader and that reader has the same uid as the write lock. 104 // 105 // A Lock may be downgraded from a write lock to a read lock only if 106 // the write lock's uid is the same as the read lock. 107 // 108 // Accesses to Lock are synchronized through the Locks object to which it 109 // belongs. 110 // 111 // +stateify savable 112 type Lock struct { 113 // Readers are the set of read lock holders identified by UniqueID. 114 // If len(Readers) > 0 then Writer must be nil. 115 Readers map[UniqueID]OwnerInfo 116 117 // Writer holds the writer unique ID. It's nil if there are no writers. 118 Writer UniqueID 119 120 // WriterInfo describes the writer. It is only meaningful if Writer != nil. 121 WriterInfo OwnerInfo 122 } 123 124 // Locks is a thread-safe wrapper around a LockSet. 125 // 126 // +stateify savable 127 type Locks struct { 128 // mu protects locks below. 129 mu sync.Mutex `state:"nosave"` 130 131 // locks is the set of region locks currently held on an Inode. 132 locks LockSet 133 134 // blockedQueue is the queue of waiters that are waiting on a lock. 135 blockedQueue waiter.Queue 136 } 137 138 // LockRegion attempts to acquire a typed lock for the uid on a region of a 139 // file. Returns nil if successful in locking the region, otherwise an 140 // appropriate error is returned. 141 func (l *Locks) LockRegion(ctx context.Context, uid UniqueID, ownerPID int32, t LockType, r LockRange, ofd bool, block bool) error { 142 l.mu.Lock() 143 defer l.mu.Unlock() 144 for { 145 146 // Blocking locks must run in a loop because we'll be woken up whenever an unlock event 147 // happens for this lock. We will then attempt to take the lock again and if it fails 148 // continue blocking. 149 err := l.locks.lock(uid, ownerPID, t, r, ofd) 150 if err == linuxerr.ErrWouldBlock && block { 151 // Note: we release the lock in EventRegister below, in 152 // order to avoid a possible race. 153 ok := ctx.BlockOn(l, waiter.EventIn) 154 l.mu.Lock() // +checklocksforce: see above. 155 if ok { 156 continue // Try again now that someone has unlocked. 157 } 158 // Must be interrupted. 159 return linuxerr.ErrInterrupted 160 } 161 162 return err 163 } 164 } 165 166 // Readiness always returns zero. 167 func (l *Locks) Readiness(waiter.EventMask) waiter.EventMask { 168 return 0 169 } 170 171 // EventRegister implements waiter.Waitable.EventRegister. 172 func (l *Locks) EventRegister(e *waiter.Entry) error { 173 defer l.mu.Unlock() // +checklocksforce: see above. 174 l.blockedQueue.EventRegister(e) 175 return nil 176 } 177 178 // EventUnregister implements waiter.Waitable.EventUnregister. 179 func (l *Locks) EventUnregister(e *waiter.Entry) { 180 l.blockedQueue.EventUnregister(e) 181 } 182 183 // UnlockRegion attempts to release a lock for the uid on a region of a file. 184 // This operation is always successful, even if there did not exist a lock on 185 // the requested region held by uid in the first place. 186 func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { 187 l.mu.Lock() 188 defer l.mu.Unlock() 189 l.locks.unlock(uid, r) 190 191 // Now that we've released the lock, we need to wake up any waiters. 192 // We track how many notifications have happened since the last attempt 193 // to acquire the lock, in order to ensure that we avoid races. 194 l.blockedQueue.Notify(waiter.EventIn) 195 } 196 197 // makeLock returns a new typed Lock that has either uid as its only reader 198 // or uid as its only writer. 199 func makeLock(uid UniqueID, ownerPID int32, t LockType, ofd bool) Lock { 200 value := Lock{Readers: make(map[UniqueID]OwnerInfo)} 201 switch t { 202 case ReadLock: 203 value.Readers[uid] = OwnerInfo{PID: ownerPID, OFD: ofd} 204 case WriteLock: 205 value.Writer = uid 206 value.WriterInfo = OwnerInfo{PID: ownerPID, OFD: ofd} 207 default: 208 panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) 209 } 210 return value 211 } 212 213 // isHeld returns true if uid is a holder of Lock. 214 func (l Lock) isHeld(uid UniqueID) bool { 215 if _, ok := l.Readers[uid]; ok { 216 return true 217 } 218 return l.Writer == uid 219 } 220 221 // lock sets uid as a holder of a typed lock on Lock. 222 // 223 // Preconditions: canLock is true for the range containing this Lock. 224 func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType, ofd bool) { 225 switch t { 226 case ReadLock: 227 // If we are already a reader, then this is a no-op. 228 if _, ok := l.Readers[uid]; ok { 229 return 230 } 231 // We cannot downgrade a write lock to a read lock unless the 232 // uid is the same. 233 if l.Writer != nil { 234 if l.Writer != uid { 235 panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) 236 } 237 // Ensure that there is only one reader if upgrading. 238 l.Readers = make(map[UniqueID]OwnerInfo) 239 // Ensure that there is no longer a writer. 240 l.Writer = nil 241 } 242 l.Readers[uid] = OwnerInfo{PID: ownerPID, OFD: ofd} 243 return 244 case WriteLock: 245 // If we are already the writer, then this is a no-op. 246 if l.Writer == uid { 247 return 248 } 249 // We can only upgrade a read lock to a write lock if there 250 // is only one reader and that reader has the same uid as 251 // the write lock. 252 if readers := len(l.Readers); readers > 0 { 253 if readers != 1 { 254 panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) 255 } 256 if _, ok := l.Readers[uid]; !ok { 257 panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) 258 } 259 } 260 // Ensure that there is only a writer. 261 l.Readers = make(map[UniqueID]OwnerInfo) 262 l.Writer = uid 263 l.WriterInfo = OwnerInfo{PID: ownerPID, OFD: ofd} 264 default: 265 panic(fmt.Sprintf("lock: invalid lock type %d", t)) 266 } 267 } 268 269 // lockable returns true if check returns true for every Lock in LockRange. 270 // Further, check should return true if Lock meets the callers requirements 271 // for locking Lock. 272 func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { 273 // Get our starting point. 274 seg := l.LowerBoundSegment(r.Start) 275 for seg.Ok() && seg.Start() < r.End { 276 // Note that we don't care about overrunning the end of the 277 // last segment because if everything checks out we'll just 278 // split the last segment. 279 if !check(seg.Value()) { 280 return false 281 } 282 // Jump to the next segment, ignoring gaps, for the same 283 // reason we ignored the first gap. 284 seg = seg.NextSegment() 285 } 286 // No conflict, we can get a lock for uid over the entire range. 287 return true 288 } 289 290 // canLock returns true if uid will be able to take a Lock of type t on the 291 // entire range specified by LockRange. 292 func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { 293 switch t { 294 case ReadLock: 295 return l.lockable(r, func(value Lock) bool { 296 // If there is no writer, there's no problem adding another reader. 297 if value.Writer == nil { 298 return true 299 } 300 // If there is a writer, then it must be the same uid 301 // in order to downgrade the lock to a read lock. 302 return value.Writer == uid 303 }) 304 case WriteLock: 305 return l.lockable(r, func(value Lock) bool { 306 // If there are only readers. 307 if value.Writer == nil { 308 // Then this uid can only take a write lock if this is a private 309 // upgrade, meaning that the only reader is uid. 310 return value.isOnlyReader(uid) 311 } 312 // If the uid is already a writer on this region, then 313 // adding a write lock would be a no-op. 314 return value.Writer == uid 315 }) 316 default: 317 panic(fmt.Sprintf("canLock: invalid lock type %d", t)) 318 } 319 } 320 321 func (l *Lock) isOnlyReader(uid UniqueID) bool { 322 if len(l.Readers) != 1 { 323 return false 324 } 325 _, ok := l.Readers[uid] 326 return ok 327 } 328 329 // lock returns nil if uid took a lock of type t on the entire range of 330 // LockRange. Otherwise, linuxerr.ErrWouldBlock is returned. 331 // 332 // Preconditions: r.Start <= r.End (will panic otherwise). 333 func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange, ofd bool) error { 334 if r.Start > r.End { 335 panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) 336 } 337 338 // Don't attempt to insert anything with a range of 0 and treat this 339 // as a successful no-op. 340 if r.Length() == 0 { 341 return nil 342 } 343 344 // Do a first-pass check. We *could* hold onto the segments we checked 345 // if canLock would return true, but traversing the segment set should 346 // be fast and this keeps things simple. 347 if !l.canLock(uid, t, r) { 348 return linuxerr.ErrWouldBlock 349 } 350 351 // Get our starting point. 352 seg, gap := l.Find(r.Start) 353 if gap.Ok() { 354 // Fill in the gap and get the next segment to modify. 355 seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t, ofd)).NextSegment() 356 } else if seg.Start() < r.Start { 357 // Get our first segment to modify. 358 _, seg = l.Split(seg, r.Start) 359 } 360 for seg.Ok() && seg.Start() < r.End { 361 // Split the last one if necessary. 362 if seg.End() > r.End { 363 seg, _ = l.SplitUnchecked(seg, r.End) 364 } 365 366 // Set the lock on the segment. This is guaranteed to 367 // always be safe, given canLock above. 368 value := seg.ValuePtr() 369 value.lock(uid, ownerPID, t, ofd) 370 371 // Fill subsequent gaps. 372 gap = seg.NextGap() 373 if gr := gap.Range().Intersect(r); gr.Length() > 0 { 374 seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t, ofd)).NextSegment() 375 } else { 376 seg = gap.NextSegment() 377 } 378 } 379 380 return nil 381 } 382 383 // unlock is always successful. If uid has no locks held for the range LockRange, 384 // unlock is a no-op. 385 // 386 // Preconditions: same as lock. 387 func (l *LockSet) unlock(uid UniqueID, r LockRange) { 388 if r.Start > r.End { 389 panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End)) 390 } 391 392 // Same as setlock. 393 if r.Length() == 0 { 394 return 395 } 396 397 // Get our starting point. 398 seg := l.LowerBoundSegment(r.Start) 399 for seg.Ok() && seg.Start() < r.End { 400 // If this segment doesn't have a lock from uid then 401 // there is no need to fragment the set with Isolate (below). 402 // In this case just move on to the next segment. 403 if !seg.Value().isHeld(uid) { 404 seg = seg.NextSegment() 405 continue 406 } 407 408 // Ensure that if we need to unlock a sub-segment that 409 // we don't unlock/remove that entire segment. 410 seg = l.Isolate(seg, r) 411 412 value := seg.Value() 413 var remove bool 414 if value.Writer == uid { 415 // If we are unlocking a writer, then since there can 416 // only ever be one writer and no readers, then this 417 // lock should always be removed from the set. 418 remove = true 419 } else if _, ok := value.Readers[uid]; ok { 420 // If uid is the last reader, then just remove the entire 421 // segment. 422 if len(value.Readers) == 1 { 423 remove = true 424 } else { 425 // Otherwise we need to remove this reader without 426 // affecting any other segment's readers. To do 427 // this, we need to make a copy of the Readers map 428 // and not add this uid. 429 newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)} 430 for k, v := range value.Readers { 431 if k != uid { 432 newValue.Readers[k] = v 433 } 434 } 435 seg.SetValue(newValue) 436 } 437 } 438 if remove { 439 seg = l.Remove(seg).NextSegment() 440 } else { 441 seg = seg.NextSegment() 442 } 443 } 444 } 445 446 // ComputeRange takes a positive file offset and computes the start of a LockRange 447 // using start (relative to offset) and the end of the LockRange using length. The 448 // values of start and length may be negative but the resulting LockRange must 449 // preserve that LockRange.Start < LockRange.End and LockRange.Start > 0. 450 func ComputeRange(start, length, offset int64) (LockRange, error) { 451 offset += start 452 // fcntl(2): "l_start can be a negative number provided the offset 453 // does not lie before the start of the file" 454 if offset < 0 { 455 return LockRange{}, unix.EINVAL 456 } 457 458 // fcntl(2): Specifying 0 for l_len has the special meaning: lock all 459 // bytes starting at the location specified by l_whence and l_start 460 // through to the end of file, no matter how large the file grows. 461 end := uint64(LockEOF) 462 if length > 0 { 463 // fcntl(2): If l_len is positive, then the range to be locked 464 // covers bytes l_start up to and including l_start+l_len-1. 465 // 466 // Since LockRange.End is exclusive we need not -1 from length.. 467 end = uint64(offset + length) 468 } else if length < 0 { 469 // fcntl(2): If l_len is negative, the interval described by 470 // lock covers bytes l_start+l_len up to and including l_start-1. 471 // 472 // Since LockRange.End is exclusive we need not -1 from offset. 473 signedEnd := offset 474 // Add to offset using a negative length (subtract). 475 offset += length 476 if offset < 0 { 477 return LockRange{}, unix.EINVAL 478 } 479 if signedEnd < offset { 480 return LockRange{}, unix.EOVERFLOW 481 } 482 // At this point signedEnd cannot be negative, 483 // since we asserted that offset is not negative 484 // and it is not less than offset. 485 end = uint64(signedEnd) 486 } 487 // Offset is guaranteed to be positive at this point. 488 return LockRange{Start: uint64(offset), End: end}, nil 489 } 490 491 // TestRegion checks whether the lock holder identified by uid can hold a lock 492 // of type t on range r. It returns a Flock struct representing this 493 // information as the F_GETLK fcntl does. 494 // 495 // Note that the PID returned in the flock structure is relative to the root PID 496 // namespace. It needs to be converted to the caller's PID namespace before 497 // returning to userspace. 498 func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange, ofd bool) linux.Flock { 499 f := linux.Flock{Type: linux.F_UNLCK} 500 switch t { 501 case ReadLock: 502 l.testRegion(r, func(lock Lock, start, length uint64) bool { 503 if lock.Writer == nil || lock.Writer == uid { 504 return true 505 } 506 f.Type = linux.F_WRLCK 507 f.PID = lock.WriterInfo.PID 508 f.Start = int64(start) 509 f.Len = int64(length) 510 return false 511 }) 512 case WriteLock: 513 l.testRegion(r, func(lock Lock, start, length uint64) bool { 514 if lock.Writer == nil { 515 for k, v := range lock.Readers { 516 if k != uid && v.OFD == ofd { 517 // Stop at the first conflict detected. 518 f.Type = linux.F_RDLCK 519 f.PID = v.PID 520 f.Start = int64(start) 521 f.Len = int64(length) 522 return false 523 } 524 } 525 return true 526 } 527 if lock.Writer == uid { 528 return true 529 } 530 f.Type = linux.F_WRLCK 531 f.PID = lock.WriterInfo.PID 532 f.Start = int64(start) 533 f.Len = int64(length) 534 return false 535 }) 536 default: 537 panic(fmt.Sprintf("TestRegion: invalid lock type %d", t)) 538 } 539 return f 540 } 541 542 func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) { 543 l.mu.Lock() 544 defer l.mu.Unlock() 545 546 seg := l.locks.LowerBoundSegment(r.Start) 547 for seg.Ok() && seg.Start() < r.End { 548 lock := seg.Value() 549 if !check(lock, seg.Start(), seg.End()-seg.Start()) { 550 // Stop at the first conflict detected. 551 return 552 } 553 seg = seg.NextSegment() 554 } 555 }