github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/lock/lock.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package lock is the API for POSIX-style advisory regional file locks and 16 // BSD-style full file locks. 17 // 18 // Callers needing to enforce these types of locks, like sys_fcntl, can call 19 // LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are 20 // specific to a unique file (unique device/inode pair) and for this reason 21 // should not be shared between files. 22 // 23 // A Lock has a set of holders identified by UniqueID. Normally this is the 24 // pid of the thread attempting to acquire the lock. 25 // 26 // Since these are advisory locks, they do not need to be integrated into 27 // Reads/Writes and for this reason there is no way to *check* if a lock is 28 // held. One can only attempt to take a lock or unlock an existing lock. 29 // 30 // A Lock in a set of Locks is typed: it is either a read lock with any number 31 // of readers and no writer, or a write lock with no readers. 32 // 33 // As expected from POSIX, any attempt to acquire a write lock on a file region 34 // when there already exits a write lock held by a different uid will fail. Any 35 // attempt to acquire a write lock on a file region when there is more than one 36 // reader will fail. Any attempt to acquire a read lock on a file region when 37 // there is already a writer will fail. 38 // 39 // In special cases, a read lock may be upgraded to a write lock and a write lock 40 // can be downgraded to a read lock. This can only happen if: 41 // 42 // * read lock upgrade to write lock: There can be only one reader and the reader 43 // must be the same as the requested write lock holder. 44 // 45 // * write lock downgrade to read lock: The writer must be the same as the requested 46 // read lock holder. 47 // 48 // UnlockRegion always succeeds. If LockRegion fails the caller should normally 49 // interpret this as "try again later". 50 package lock 51 52 import ( 53 "fmt" 54 "math" 55 56 "golang.org/x/sys/unix" 57 "github.com/SagerNet/gvisor/pkg/abi/linux" 58 "github.com/SagerNet/gvisor/pkg/context" 59 "github.com/SagerNet/gvisor/pkg/sync" 60 "github.com/SagerNet/gvisor/pkg/waiter" 61 ) 62 63 // LockType is a type of regional file lock. 64 type LockType int 65 66 // UniqueID is a unique identifier of the holder of a regional file lock. 67 type UniqueID interface{} 68 69 const ( 70 // ReadLock describes a POSIX regional file lock to be taken 71 // read only. There may be multiple of these locks on a single 72 // file region as long as there is no writer lock on the same 73 // region. 74 ReadLock LockType = iota 75 76 // WriteLock describes a POSIX regional file lock to be taken 77 // write only. There may be only a single holder of this lock 78 // and no read locks. 79 WriteLock 80 ) 81 82 // LockEOF is the maximal possible end of a regional file lock. 83 // 84 // A BSD-style full file lock can be represented as a regional file lock from 85 // offset 0 to LockEOF. 86 const LockEOF = math.MaxUint64 87 88 // OwnerInfo describes the owner of a lock. 89 // 90 // TODO(github.com/SagerNet/issue/5264): We may need to add other fields in the future 91 // (e.g., Linux's file_lock.fl_flags to support open file-descriptor locks). 92 // 93 // +stateify savable 94 type OwnerInfo struct { 95 // PID is the process ID of the lock owner. 96 PID int32 97 } 98 99 // Lock is a regional file lock. It consists of either a single writer 100 // or a set of readers. 101 // 102 // A Lock may be upgraded from a read lock to a write lock only if there 103 // is a single reader and that reader has the same uid as the write lock. 104 // 105 // A Lock may be downgraded from a write lock to a read lock only if 106 // the write lock's uid is the same as the read lock. 107 // 108 // Accesses to Lock are synchronized through the Locks object to which it 109 // belongs. 110 // 111 // +stateify savable 112 type Lock struct { 113 // Readers are the set of read lock holders identified by UniqueID. 114 // If len(Readers) > 0 then Writer must be nil. 115 Readers map[UniqueID]OwnerInfo 116 117 // Writer holds the writer unique ID. It's nil if there are no writers. 118 Writer UniqueID 119 120 // WriterInfo describes the writer. It is only meaningful if Writer != nil. 121 WriterInfo OwnerInfo 122 } 123 124 // Locks is a thread-safe wrapper around a LockSet. 125 // 126 // +stateify savable 127 type Locks struct { 128 // mu protects locks below. 129 mu sync.Mutex `state:"nosave"` 130 131 // locks is the set of region locks currently held on an Inode. 132 locks LockSet 133 134 // blockedQueue is the queue of waiters that are waiting on a lock. 135 blockedQueue waiter.Queue `state:"zerovalue"` 136 } 137 138 // Blocker is the interface used for blocking locks. Passing a nil Blocker 139 // will be treated as non-blocking. 140 type Blocker interface { 141 Block(C <-chan struct{}) error 142 } 143 144 const ( 145 // EventMaskAll is the mask we will always use for locks, by using the 146 // same mask all the time we can wake up everyone anytime the lock 147 // changes state. 148 EventMaskAll waiter.EventMask = 0xFFFF 149 ) 150 151 // LockRegion attempts to acquire a typed lock for the uid on a region 152 // of a file. Returns true if successful in locking the region. If false 153 // is returned, the caller should normally interpret this as "try again later" if 154 // acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. 155 // Blocker is the interface used to provide blocking behavior, passing a nil Blocker 156 // will result in non-blocking behavior. 157 func (l *Locks) LockRegion(uid UniqueID, ownerPID int32, t LockType, r LockRange, block Blocker) bool { 158 for { 159 l.mu.Lock() 160 161 // Blocking locks must run in a loop because we'll be woken up whenever an unlock event 162 // happens for this lock. We will then attempt to take the lock again and if it fails 163 // continue blocking. 164 res := l.locks.lock(uid, ownerPID, t, r) 165 if !res && block != nil { 166 e, ch := waiter.NewChannelEntry(nil) 167 l.blockedQueue.EventRegister(&e, EventMaskAll) 168 l.mu.Unlock() 169 if err := block.Block(ch); err != nil { 170 // We were interrupted, the caller can translate this to EINTR if applicable. 171 l.blockedQueue.EventUnregister(&e) 172 return false 173 } 174 l.blockedQueue.EventUnregister(&e) 175 continue // Try again now that someone has unlocked. 176 } 177 178 l.mu.Unlock() 179 return res 180 } 181 } 182 183 // LockRegionVFS1 is a wrapper around LockRegion for VFS1, which does not implement 184 // F_GETLK (and does not care about storing PIDs as a result). 185 // 186 // TODO(github.com/SagerNet/issue/1624): Delete. 187 func (l *Locks) LockRegionVFS1(uid UniqueID, t LockType, r LockRange, block Blocker) bool { 188 return l.LockRegion(uid, 0 /* ownerPID */, t, r, block) 189 } 190 191 // UnlockRegion attempts to release a lock for the uid on a region of a file. 192 // This operation is always successful, even if there did not exist a lock on 193 // the requested region held by uid in the first place. 194 func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { 195 l.mu.Lock() 196 defer l.mu.Unlock() 197 l.locks.unlock(uid, r) 198 199 // Now that we've released the lock, we need to wake up any waiters. 200 l.blockedQueue.Notify(EventMaskAll) 201 } 202 203 // makeLock returns a new typed Lock that has either uid as its only reader 204 // or uid as its only writer. 205 func makeLock(uid UniqueID, ownerPID int32, t LockType) Lock { 206 value := Lock{Readers: make(map[UniqueID]OwnerInfo)} 207 switch t { 208 case ReadLock: 209 value.Readers[uid] = OwnerInfo{PID: ownerPID} 210 case WriteLock: 211 value.Writer = uid 212 value.WriterInfo = OwnerInfo{PID: ownerPID} 213 default: 214 panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) 215 } 216 return value 217 } 218 219 // isHeld returns true if uid is a holder of Lock. 220 func (l Lock) isHeld(uid UniqueID) bool { 221 if _, ok := l.Readers[uid]; ok { 222 return true 223 } 224 return l.Writer == uid 225 } 226 227 // lock sets uid as a holder of a typed lock on Lock. 228 // 229 // Preconditions: canLock is true for the range containing this Lock. 230 func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType) { 231 switch t { 232 case ReadLock: 233 // If we are already a reader, then this is a no-op. 234 if _, ok := l.Readers[uid]; ok { 235 return 236 } 237 // We cannot downgrade a write lock to a read lock unless the 238 // uid is the same. 239 if l.Writer != nil { 240 if l.Writer != uid { 241 panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) 242 } 243 // Ensure that there is only one reader if upgrading. 244 l.Readers = make(map[UniqueID]OwnerInfo) 245 // Ensure that there is no longer a writer. 246 l.Writer = nil 247 } 248 l.Readers[uid] = OwnerInfo{PID: ownerPID} 249 return 250 case WriteLock: 251 // If we are already the writer, then this is a no-op. 252 if l.Writer == uid { 253 return 254 } 255 // We can only upgrade a read lock to a write lock if there 256 // is only one reader and that reader has the same uid as 257 // the write lock. 258 if readers := len(l.Readers); readers > 0 { 259 if readers != 1 { 260 panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) 261 } 262 if _, ok := l.Readers[uid]; !ok { 263 panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) 264 } 265 } 266 // Ensure that there is only a writer. 267 l.Readers = make(map[UniqueID]OwnerInfo) 268 l.Writer = uid 269 l.WriterInfo = OwnerInfo{PID: ownerPID} 270 default: 271 panic(fmt.Sprintf("lock: invalid lock type %d", t)) 272 } 273 } 274 275 // lockable returns true if check returns true for every Lock in LockRange. 276 // Further, check should return true if Lock meets the callers requirements 277 // for locking Lock. 278 func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { 279 // Get our starting point. 280 seg := l.LowerBoundSegment(r.Start) 281 for seg.Ok() && seg.Start() < r.End { 282 // Note that we don't care about overrunning the end of the 283 // last segment because if everything checks out we'll just 284 // split the last segment. 285 if !check(seg.Value()) { 286 return false 287 } 288 // Jump to the next segment, ignoring gaps, for the same 289 // reason we ignored the first gap. 290 seg = seg.NextSegment() 291 } 292 // No conflict, we can get a lock for uid over the entire range. 293 return true 294 } 295 296 // canLock returns true if uid will be able to take a Lock of type t on the 297 // entire range specified by LockRange. 298 func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { 299 switch t { 300 case ReadLock: 301 return l.lockable(r, func(value Lock) bool { 302 // If there is no writer, there's no problem adding another reader. 303 if value.Writer == nil { 304 return true 305 } 306 // If there is a writer, then it must be the same uid 307 // in order to downgrade the lock to a read lock. 308 return value.Writer == uid 309 }) 310 case WriteLock: 311 return l.lockable(r, func(value Lock) bool { 312 // If there are only readers. 313 if value.Writer == nil { 314 // Then this uid can only take a write lock if this is a private 315 // upgrade, meaning that the only reader is uid. 316 return value.isOnlyReader(uid) 317 } 318 // If the uid is already a writer on this region, then 319 // adding a write lock would be a no-op. 320 return value.Writer == uid 321 }) 322 default: 323 panic(fmt.Sprintf("canLock: invalid lock type %d", t)) 324 } 325 } 326 327 func (l *Lock) isOnlyReader(uid UniqueID) bool { 328 if len(l.Readers) != 1 { 329 return false 330 } 331 _, ok := l.Readers[uid] 332 return ok 333 } 334 335 // lock returns true if uid took a lock of type t on the entire range of 336 // LockRange. 337 // 338 // Preconditions: r.Start <= r.End (will panic otherwise). 339 func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange) bool { 340 if r.Start > r.End { 341 panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) 342 } 343 344 // Don't attempt to insert anything with a range of 0 and treat this 345 // as a successful no-op. 346 if r.Length() == 0 { 347 return true 348 } 349 350 // Do a first-pass check. We *could* hold onto the segments we 351 // checked if canLock would return true, but traversing the segment 352 // set should be fast and this keeps things simple. 353 if !l.canLock(uid, t, r) { 354 return false 355 } 356 // Get our starting point. 357 seg, gap := l.Find(r.Start) 358 if gap.Ok() { 359 // Fill in the gap and get the next segment to modify. 360 seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t)).NextSegment() 361 } else if seg.Start() < r.Start { 362 // Get our first segment to modify. 363 _, seg = l.Split(seg, r.Start) 364 } 365 for seg.Ok() && seg.Start() < r.End { 366 // Split the last one if necessary. 367 if seg.End() > r.End { 368 seg, _ = l.SplitUnchecked(seg, r.End) 369 } 370 371 // Set the lock on the segment. This is guaranteed to 372 // always be safe, given canLock above. 373 value := seg.ValuePtr() 374 value.lock(uid, ownerPID, t) 375 376 // Fill subsequent gaps. 377 gap = seg.NextGap() 378 if gr := gap.Range().Intersect(r); gr.Length() > 0 { 379 seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t)).NextSegment() 380 } else { 381 seg = gap.NextSegment() 382 } 383 } 384 return true 385 } 386 387 // unlock is always successful. If uid has no locks held for the range LockRange, 388 // unlock is a no-op. 389 // 390 // Preconditions: same as lock. 391 func (l *LockSet) unlock(uid UniqueID, r LockRange) { 392 if r.Start > r.End { 393 panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End)) 394 } 395 396 // Same as setlock. 397 if r.Length() == 0 { 398 return 399 } 400 401 // Get our starting point. 402 seg := l.LowerBoundSegment(r.Start) 403 for seg.Ok() && seg.Start() < r.End { 404 // If this segment doesn't have a lock from uid then 405 // there is no need to fragment the set with Isolate (below). 406 // In this case just move on to the next segment. 407 if !seg.Value().isHeld(uid) { 408 seg = seg.NextSegment() 409 continue 410 } 411 412 // Ensure that if we need to unlock a sub-segment that 413 // we don't unlock/remove that entire segment. 414 seg = l.Isolate(seg, r) 415 416 value := seg.Value() 417 var remove bool 418 if value.Writer == uid { 419 // If we are unlocking a writer, then since there can 420 // only ever be one writer and no readers, then this 421 // lock should always be removed from the set. 422 remove = true 423 } else if _, ok := value.Readers[uid]; ok { 424 // If uid is the last reader, then just remove the entire 425 // segment. 426 if len(value.Readers) == 1 { 427 remove = true 428 } else { 429 // Otherwise we need to remove this reader without 430 // affecting any other segment's readers. To do 431 // this, we need to make a copy of the Readers map 432 // and not add this uid. 433 newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)} 434 for k, v := range value.Readers { 435 if k != uid { 436 newValue.Readers[k] = v 437 } 438 } 439 seg.SetValue(newValue) 440 } 441 } 442 if remove { 443 seg = l.Remove(seg).NextSegment() 444 } else { 445 seg = seg.NextSegment() 446 } 447 } 448 } 449 450 // ComputeRange takes a positive file offset and computes the start of a LockRange 451 // using start (relative to offset) and the end of the LockRange using length. The 452 // values of start and length may be negative but the resulting LockRange must 453 // preserve that LockRange.Start < LockRange.End and LockRange.Start > 0. 454 func ComputeRange(start, length, offset int64) (LockRange, error) { 455 offset += start 456 // fcntl(2): "l_start can be a negative number provided the offset 457 // does not lie before the start of the file" 458 if offset < 0 { 459 return LockRange{}, unix.EINVAL 460 } 461 462 // fcntl(2): Specifying 0 for l_len has the special meaning: lock all 463 // bytes starting at the location specified by l_whence and l_start 464 // through to the end of file, no matter how large the file grows. 465 end := uint64(LockEOF) 466 if length > 0 { 467 // fcntl(2): If l_len is positive, then the range to be locked 468 // covers bytes l_start up to and including l_start+l_len-1. 469 // 470 // Since LockRange.End is exclusive we need not -1 from length.. 471 end = uint64(offset + length) 472 } else if length < 0 { 473 // fcntl(2): If l_len is negative, the interval described by 474 // lock covers bytes l_start+l_len up to and including l_start-1. 475 // 476 // Since LockRange.End is exclusive we need not -1 from offset. 477 signedEnd := offset 478 // Add to offset using a negative length (subtract). 479 offset += length 480 if offset < 0 { 481 return LockRange{}, unix.EINVAL 482 } 483 if signedEnd < offset { 484 return LockRange{}, unix.EOVERFLOW 485 } 486 // At this point signedEnd cannot be negative, 487 // since we asserted that offset is not negative 488 // and it is not less than offset. 489 end = uint64(signedEnd) 490 } 491 // Offset is guaranteed to be positive at this point. 492 return LockRange{Start: uint64(offset), End: end}, nil 493 } 494 495 // TestRegion checks whether the lock holder identified by uid can hold a lock 496 // of type t on range r. It returns a Flock struct representing this 497 // information as the F_GETLK fcntl does. 498 // 499 // Note that the PID returned in the flock structure is relative to the root PID 500 // namespace. It needs to be converted to the caller's PID namespace before 501 // returning to userspace. 502 // 503 // TODO(github.com/SagerNet/issue/5264): we don't support OFD locks through fcntl, which 504 // would return a struct with pid = -1. 505 func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange) linux.Flock { 506 f := linux.Flock{Type: linux.F_UNLCK} 507 switch t { 508 case ReadLock: 509 l.testRegion(r, func(lock Lock, start, length uint64) bool { 510 if lock.Writer == nil || lock.Writer == uid { 511 return true 512 } 513 f.Type = linux.F_WRLCK 514 f.PID = lock.WriterInfo.PID 515 f.Start = int64(start) 516 f.Len = int64(length) 517 return false 518 }) 519 case WriteLock: 520 l.testRegion(r, func(lock Lock, start, length uint64) bool { 521 if lock.Writer == nil { 522 for k, v := range lock.Readers { 523 if k != uid { 524 // Stop at the first conflict detected. 525 f.Type = linux.F_RDLCK 526 f.PID = v.PID 527 f.Start = int64(start) 528 f.Len = int64(length) 529 return false 530 } 531 } 532 return true 533 } 534 if lock.Writer == uid { 535 return true 536 } 537 f.Type = linux.F_WRLCK 538 f.PID = lock.WriterInfo.PID 539 f.Start = int64(start) 540 f.Len = int64(length) 541 return false 542 }) 543 default: 544 panic(fmt.Sprintf("TestRegion: invalid lock type %d", t)) 545 } 546 return f 547 } 548 549 func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) { 550 l.mu.Lock() 551 defer l.mu.Unlock() 552 553 seg := l.locks.LowerBoundSegment(r.Start) 554 for seg.Ok() && seg.Start() < r.End { 555 lock := seg.Value() 556 if !check(lock, seg.Start(), seg.End()-seg.Start()) { 557 // Stop at the first conflict detected. 558 return 559 } 560 seg = seg.NextSegment() 561 } 562 }