github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/lock/lock.go

github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/lock/lock.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package lock is the API for POSIX-style advisory regional file locks and
    16  // BSD-style full file locks.
    17  //
    18  // Callers needing to enforce these types of locks, like sys_fcntl, can call
    19  // LockRegion and UnlockRegion on a thread-safe set of Locks.  Locks are
    20  // specific to a unique file (unique device/inode pair) and for this reason
    21  // should not be shared between files.
    22  //
    23  // A Lock has a set of holders identified by UniqueID.  Normally this is the
    24  // pid of the thread attempting to acquire the lock.
    25  //
    26  // Since these are advisory locks, they do not need to be integrated into
    27  // Reads/Writes and for this reason there is no way to *check* if a lock is
    28  // held.  One can only attempt to take a lock or unlock an existing lock.
    29  //
    30  // A Lock in a set of Locks is typed: it is either a read lock with any number
    31  // of readers and no writer, or a write lock with no readers.
    32  //
    33  // As expected from POSIX, any attempt to acquire a write lock on a file region
    34  // when there already exists a write lock held by a different uid will fail. Any
    35  // attempt to acquire a write lock on a file region when there is a reader other
    36  // than the requester will fail.  Any attempt to acquire a read lock on a file
    37  // region when there is already a writer with a different uid will fail.
    38  //
    39  // In special cases, a read lock may be upgraded to a write lock and a write lock
    40  // can be downgraded to a read lock.  This can only happen if:
    41  //
    42  //  * read lock upgrade to write lock: There can be only one reader and the reader
    43  //    must be the same as the requested write lock holder.
    44  //
    45  //  * write lock downgrade to read lock: The writer must be the same as the requested
    46  //    read lock holder.
    47  //
    48  // UnlockRegion always succeeds.  If LockRegion fails the caller should normally
    49  // interpret this as "try again later".
    50  package lock
    51  
    52  import (
    53  	"fmt"
    54  	"math"
    55  
    56  	"golang.org/x/sys/unix"
    57  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    58  	"github.com/SagerNet/gvisor/pkg/context"
    59  	"github.com/SagerNet/gvisor/pkg/sync"
    60  	"github.com/SagerNet/gvisor/pkg/waiter"
    61  )
    62  
// LockType is a type of regional file lock. The values declared in this
// package are ReadLock and WriteLock; the functions in this file panic when
// handed any other value.
type LockType int
    65  
// UniqueID is a unique identifier of the holder of a regional file lock.
//
// Values are used as keys of Lock.Readers and are compared with == against
// Lock.Writer, so the dynamic type stored in a UniqueID must be comparable.
// A nil Lock.Writer means "no writer".
type UniqueID interface{}
    68  
const (
	// ReadLock describes a POSIX regional file lock to be taken
	// read only.  There may be multiple of these locks on a single
	// file region as long as there is no writer lock on the same
	// region.
	ReadLock LockType = iota

	// WriteLock describes a POSIX regional file lock to be taken
	// exclusively for writing.  There may be only a single holder of
	// this lock and no read locks on the same region.
	WriteLock
)
    81  
// LockEOF is the maximal possible end of a regional file lock.
//
// A BSD-style full file lock can be represented as a regional file lock from
// offset 0 to LockEOF. ComputeRange also uses it as the end of a range whose
// requested length is 0.
const LockEOF = math.MaxUint64
    87  
// OwnerInfo describes the owner of a lock.
//
// TODO(github.com/SagerNet/issue/5264): We may need to add other fields in the future
// (e.g., Linux's file_lock.fl_flags to support open file-descriptor locks).
//
// +stateify savable
type OwnerInfo struct {
	// PID is the process ID of the lock owner. TestRegion copies it into
	// the PID field of the Flock it returns for a conflicting lock.
	PID int32
}
    98  
// Lock is a regional file lock.  It consists of either a single writer
// or a set of readers.
//
// A Lock may be upgraded from a read lock to a write lock only if there
// is a single reader and that reader has the same uid as the write lock.
//
// A Lock may be downgraded from a write lock to a read lock only if
// the write lock's uid is the same as the read lock.
//
// Accesses to Lock are synchronized through the Locks object to which it
// belongs.
//
// +stateify savable
type Lock struct {
	// Readers are the set of read lock holders identified by UniqueID,
	// each mapped to the OwnerInfo recorded when it took the lock.
	// If len(Readers) > 0 then Writer must be nil.
	Readers map[UniqueID]OwnerInfo

	// Writer holds the writer unique ID. It's nil if there are no writers.
	Writer UniqueID

	// WriterInfo describes the writer. It is only meaningful if Writer != nil.
	WriterInfo OwnerInfo
}
   123  
// Locks is a thread-safe wrapper around a LockSet.
//
// +stateify savable
type Locks struct {
	// mu protects locks below.
	mu sync.Mutex `state:"nosave"`

	// locks is the set of region locks currently held on an Inode.
	locks LockSet

	// blockedQueue is the queue of waiters that are waiting on a lock.
	// LockRegion registers an entry here before blocking, and UnlockRegion
	// notifies every entry after releasing a region.
	blockedQueue waiter.Queue `state:"zerovalue"`
}
   137  
// Blocker is the interface used for blocking locks. Passing a nil Blocker
// will be treated as non-blocking.
type Blocker interface {
	// Block is called by LockRegion, without the Locks mutex held, after
	// a lock attempt failed. C is the channel of the waiter entry that
	// LockRegion registered on the blocked queue. LockRegion retries the
	// lock when Block returns nil and gives up, returning false, when
	// Block returns a non-nil error.
	Block(C <-chan struct{}) error
}
   143  
const (
	// EventMaskAll is the mask we will always use for locks, by using the
	// same mask all the time we can wake up everyone anytime the lock
	// changes state. It is used both when registering a waiter in
	// LockRegion and when notifying waiters in UnlockRegion.
	EventMaskAll waiter.EventMask = 0xFFFF
)
   150  
   151  // LockRegion attempts to acquire a typed lock for the uid on a region
   152  // of a file. Returns true if successful in locking the region. If false
   153  // is returned, the caller should normally interpret this as "try again later" if
   154  // acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode.
   155  // Blocker is the interface used to provide blocking behavior, passing a nil Blocker
   156  // will result in non-blocking behavior.
   157  func (l *Locks) LockRegion(uid UniqueID, ownerPID int32, t LockType, r LockRange, block Blocker) bool {
   158  	for {
   159  		l.mu.Lock()
   160  
   161  		// Blocking locks must run in a loop because we'll be woken up whenever an unlock event
   162  		// happens for this lock. We will then attempt to take the lock again and if it fails
   163  		// continue blocking.
   164  		res := l.locks.lock(uid, ownerPID, t, r)
   165  		if !res && block != nil {
   166  			e, ch := waiter.NewChannelEntry(nil)
   167  			l.blockedQueue.EventRegister(&e, EventMaskAll)
   168  			l.mu.Unlock()
   169  			if err := block.Block(ch); err != nil {
   170  				// We were interrupted, the caller can translate this to EINTR if applicable.
   171  				l.blockedQueue.EventUnregister(&e)
   172  				return false
   173  			}
   174  			l.blockedQueue.EventUnregister(&e)
   175  			continue // Try again now that someone has unlocked.
   176  		}
   177  
   178  		l.mu.Unlock()
   179  		return res
   180  	}
   181  }
   182  
   183  // LockRegionVFS1 is a wrapper around LockRegion for VFS1, which does not implement
   184  // F_GETLK (and does not care about storing PIDs as a result).
   185  //
   186  // TODO(github.com/SagerNet/issue/1624): Delete.
   187  func (l *Locks) LockRegionVFS1(uid UniqueID, t LockType, r LockRange, block Blocker) bool {
   188  	return l.LockRegion(uid, 0 /* ownerPID */, t, r, block)
   189  }
   190  
// UnlockRegion attempts to release a lock for the uid on a region of a file.
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
//
// Every waiter on the blocked queue is notified afterwards, whether or not
// anything was actually released.
func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.locks.unlock(uid, r)

	// Now that we've released the lock, we need to wake up any waiters.
	// This happens with mu still held; woken waiters retry in LockRegion,
	// which takes mu itself.
	l.blockedQueue.Notify(EventMaskAll)
}
   202  
   203  // makeLock returns a new typed Lock that has either uid as its only reader
   204  // or uid as its only writer.
   205  func makeLock(uid UniqueID, ownerPID int32, t LockType) Lock {
   206  	value := Lock{Readers: make(map[UniqueID]OwnerInfo)}
   207  	switch t {
   208  	case ReadLock:
   209  		value.Readers[uid] = OwnerInfo{PID: ownerPID}
   210  	case WriteLock:
   211  		value.Writer = uid
   212  		value.WriterInfo = OwnerInfo{PID: ownerPID}
   213  	default:
   214  		panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
   215  	}
   216  	return value
   217  }
   218  
   219  // isHeld returns true if uid is a holder of Lock.
   220  func (l Lock) isHeld(uid UniqueID) bool {
   221  	if _, ok := l.Readers[uid]; ok {
   222  		return true
   223  	}
   224  	return l.Writer == uid
   225  }
   226  
   227  // lock sets uid as a holder of a typed lock on Lock.
   228  //
   229  // Preconditions: canLock is true for the range containing this Lock.
   230  func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType) {
   231  	switch t {
   232  	case ReadLock:
   233  		// If we are already a reader, then this is a no-op.
   234  		if _, ok := l.Readers[uid]; ok {
   235  			return
   236  		}
   237  		// We cannot downgrade a write lock to a read lock unless the
   238  		// uid is the same.
   239  		if l.Writer != nil {
   240  			if l.Writer != uid {
   241  				panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
   242  			}
   243  			// Ensure that there is only one reader if upgrading.
   244  			l.Readers = make(map[UniqueID]OwnerInfo)
   245  			// Ensure that there is no longer a writer.
   246  			l.Writer = nil
   247  		}
   248  		l.Readers[uid] = OwnerInfo{PID: ownerPID}
   249  		return
   250  	case WriteLock:
   251  		// If we are already the writer, then this is a no-op.
   252  		if l.Writer == uid {
   253  			return
   254  		}
   255  		// We can only upgrade a read lock to a write lock if there
   256  		// is only one reader and that reader has the same uid as
   257  		// the write lock.
   258  		if readers := len(l.Readers); readers > 0 {
   259  			if readers != 1 {
   260  				panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
   261  			}
   262  			if _, ok := l.Readers[uid]; !ok {
   263  				panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
   264  			}
   265  		}
   266  		// Ensure that there is only a writer.
   267  		l.Readers = make(map[UniqueID]OwnerInfo)
   268  		l.Writer = uid
   269  		l.WriterInfo = OwnerInfo{PID: ownerPID}
   270  	default:
   271  		panic(fmt.Sprintf("lock: invalid lock type %d", t))
   272  	}
   273  }
   274  
   275  // lockable returns true if check returns true for every Lock in LockRange.
   276  // Further, check should return true if Lock meets the callers requirements
   277  // for locking Lock.
   278  func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
   279  	// Get our starting point.
   280  	seg := l.LowerBoundSegment(r.Start)
   281  	for seg.Ok() && seg.Start() < r.End {
   282  		// Note that we don't care about overrunning the end of the
   283  		// last segment because if everything checks out we'll just
   284  		// split the last segment.
   285  		if !check(seg.Value()) {
   286  			return false
   287  		}
   288  		// Jump to the next segment, ignoring gaps, for the same
   289  		// reason we ignored the first gap.
   290  		seg = seg.NextSegment()
   291  	}
   292  	// No conflict, we can get a lock for uid over the entire range.
   293  	return true
   294  }
   295  
   296  // canLock returns true if uid will be able to take a Lock of type t on the
   297  // entire range specified by LockRange.
   298  func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
   299  	switch t {
   300  	case ReadLock:
   301  		return l.lockable(r, func(value Lock) bool {
   302  			// If there is no writer, there's no problem adding another reader.
   303  			if value.Writer == nil {
   304  				return true
   305  			}
   306  			// If there is a writer, then it must be the same uid
   307  			// in order to downgrade the lock to a read lock.
   308  			return value.Writer == uid
   309  		})
   310  	case WriteLock:
   311  		return l.lockable(r, func(value Lock) bool {
   312  			// If there are only readers.
   313  			if value.Writer == nil {
   314  				// Then this uid can only take a write lock if this is a private
   315  				// upgrade, meaning that the only reader is uid.
   316  				return value.isOnlyReader(uid)
   317  			}
   318  			// If the uid is already a writer on this region, then
   319  			// adding a write lock would be a no-op.
   320  			return value.Writer == uid
   321  		})
   322  	default:
   323  		panic(fmt.Sprintf("canLock: invalid lock type %d", t))
   324  	}
   325  }
   326  
   327  func (l *Lock) isOnlyReader(uid UniqueID) bool {
   328  	if len(l.Readers) != 1 {
   329  		return false
   330  	}
   331  	_, ok := l.Readers[uid]
   332  	return ok
   333  }
   334  
// lock attempts to make uid a holder of a lock of type t on every part of r,
// recording ownerPID as the holder's OwnerInfo. It returns false, without
// modifying the set, if canLock reports a conflict with an existing Lock in
// r; otherwise it returns true. A zero-length r is a successful no-op.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange) bool {
	if r.Start > r.End {
		panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
	}

	// Don't attempt to insert anything with a range of 0 and treat this
	// as a successful no-op.
	if r.Length() == 0 {
		return true
	}

	// Do a first-pass check.  We *could* hold onto the segments we
	// checked if canLock would return true, but traversing the segment
	// set should be fast and this keeps things simple.
	if !l.canLock(uid, t, r) {
		return false
	}
	// Get our starting point: either the segment containing r.Start or,
	// if r.Start is unlocked, the gap containing it.
	seg, gap := l.Find(r.Start)
	if gap.Ok() {
		// Fill in the gap and get the next segment to modify.
		seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t)).NextSegment()
	} else if seg.Start() < r.Start {
		// Get our first segment to modify: split off the part of seg
		// that precedes r.Start so that it keeps its current holders.
		_, seg = l.Split(seg, r.Start)
	}
	for seg.Ok() && seg.Start() < r.End {
		// Split the last one if necessary, so that the part of the
		// segment beyond r.End is left untouched.
		if seg.End() > r.End {
			seg, _ = l.SplitUnchecked(seg, r.End)
		}

		// Set the lock on the segment. This is guaranteed to
		// always be safe, given canLock above.
		value := seg.ValuePtr()
		value.lock(uid, ownerPID, t)

		// Fill subsequent gaps: if the gap following this segment
		// overlaps r, cover the overlap with a new lock held by uid.
		gap = seg.NextGap()
		if gr := gap.Range().Intersect(r); gr.Length() > 0 {
			seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t)).NextSegment()
		} else {
			seg = gap.NextSegment()
		}
	}
	return true
}
   386  
   387  // unlock is always successful.  If uid has no locks held for the range LockRange,
   388  // unlock is a no-op.
   389  //
   390  // Preconditions: same as lock.
   391  func (l *LockSet) unlock(uid UniqueID, r LockRange) {
   392  	if r.Start > r.End {
   393  		panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
   394  	}
   395  
   396  	// Same as setlock.
   397  	if r.Length() == 0 {
   398  		return
   399  	}
   400  
   401  	// Get our starting point.
   402  	seg := l.LowerBoundSegment(r.Start)
   403  	for seg.Ok() && seg.Start() < r.End {
   404  		// If this segment doesn't have a lock from uid then
   405  		// there is no need to fragment the set with Isolate (below).
   406  		// In this case just move on to the next segment.
   407  		if !seg.Value().isHeld(uid) {
   408  			seg = seg.NextSegment()
   409  			continue
   410  		}
   411  
   412  		// Ensure that if we need to unlock a sub-segment that
   413  		// we don't unlock/remove that entire segment.
   414  		seg = l.Isolate(seg, r)
   415  
   416  		value := seg.Value()
   417  		var remove bool
   418  		if value.Writer == uid {
   419  			// If we are unlocking a writer, then since there can
   420  			// only ever be one writer and no readers, then this
   421  			// lock should always be removed from the set.
   422  			remove = true
   423  		} else if _, ok := value.Readers[uid]; ok {
   424  			// If uid is the last reader, then just remove the entire
   425  			// segment.
   426  			if len(value.Readers) == 1 {
   427  				remove = true
   428  			} else {
   429  				// Otherwise we need to remove this reader without
   430  				// affecting any other segment's readers.  To do
   431  				// this, we need to make a copy of the Readers map
   432  				// and not add this uid.
   433  				newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)}
   434  				for k, v := range value.Readers {
   435  					if k != uid {
   436  						newValue.Readers[k] = v
   437  					}
   438  				}
   439  				seg.SetValue(newValue)
   440  			}
   441  		}
   442  		if remove {
   443  			seg = l.Remove(seg).NextSegment()
   444  		} else {
   445  			seg = seg.NextSegment()
   446  		}
   447  	}
   448  }
   449  
   450  // ComputeRange takes a positive file offset and computes the start of a LockRange
   451  // using start (relative to offset) and the end of the LockRange using length. The
   452  // values of start and length may be negative but the resulting LockRange must
   453  // preserve that LockRange.Start < LockRange.End and LockRange.Start > 0.
   454  func ComputeRange(start, length, offset int64) (LockRange, error) {
   455  	offset += start
   456  	// fcntl(2): "l_start can be a negative number provided the offset
   457  	// does not lie before the start of the file"
   458  	if offset < 0 {
   459  		return LockRange{}, unix.EINVAL
   460  	}
   461  
   462  	// fcntl(2): Specifying 0 for l_len has the  special meaning: lock all
   463  	// bytes starting at the location specified by l_whence and l_start
   464  	// through to the end of file, no matter how large the file grows.
   465  	end := uint64(LockEOF)
   466  	if length > 0 {
   467  		// fcntl(2): If l_len is positive, then the range to be locked
   468  		// covers bytes l_start up to and including l_start+l_len-1.
   469  		//
   470  		// Since LockRange.End is exclusive we need not -1 from length..
   471  		end = uint64(offset + length)
   472  	} else if length < 0 {
   473  		// fcntl(2): If l_len is negative, the interval described by
   474  		// lock covers bytes l_start+l_len up to and including l_start-1.
   475  		//
   476  		// Since LockRange.End is exclusive we need not -1 from offset.
   477  		signedEnd := offset
   478  		// Add to offset using a negative length (subtract).
   479  		offset += length
   480  		if offset < 0 {
   481  			return LockRange{}, unix.EINVAL
   482  		}
   483  		if signedEnd < offset {
   484  			return LockRange{}, unix.EOVERFLOW
   485  		}
   486  		// At this point signedEnd cannot be negative,
   487  		// since we asserted that offset is not negative
   488  		// and it is not less than offset.
   489  		end = uint64(signedEnd)
   490  	}
   491  	// Offset is guaranteed to be positive at this point.
   492  	return LockRange{Start: uint64(offset), End: end}, nil
   493  }
   494  
   495  // TestRegion checks whether the lock holder identified by uid can hold a lock
   496  // of type t on range r. It returns a Flock struct representing this
   497  // information as the F_GETLK fcntl does.
   498  //
   499  // Note that the PID returned in the flock structure is relative to the root PID
   500  // namespace. It needs to be converted to the caller's PID namespace before
   501  // returning to userspace.
   502  //
   503  // TODO(github.com/SagerNet/issue/5264): we don't support OFD locks through fcntl, which
   504  // would return a struct with pid = -1.
   505  func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange) linux.Flock {
   506  	f := linux.Flock{Type: linux.F_UNLCK}
   507  	switch t {
   508  	case ReadLock:
   509  		l.testRegion(r, func(lock Lock, start, length uint64) bool {
   510  			if lock.Writer == nil || lock.Writer == uid {
   511  				return true
   512  			}
   513  			f.Type = linux.F_WRLCK
   514  			f.PID = lock.WriterInfo.PID
   515  			f.Start = int64(start)
   516  			f.Len = int64(length)
   517  			return false
   518  		})
   519  	case WriteLock:
   520  		l.testRegion(r, func(lock Lock, start, length uint64) bool {
   521  			if lock.Writer == nil {
   522  				for k, v := range lock.Readers {
   523  					if k != uid {
   524  						// Stop at the first conflict detected.
   525  						f.Type = linux.F_RDLCK
   526  						f.PID = v.PID
   527  						f.Start = int64(start)
   528  						f.Len = int64(length)
   529  						return false
   530  					}
   531  				}
   532  				return true
   533  			}
   534  			if lock.Writer == uid {
   535  				return true
   536  			}
   537  			f.Type = linux.F_WRLCK
   538  			f.PID = lock.WriterInfo.PID
   539  			f.Start = int64(start)
   540  			f.Len = int64(length)
   541  			return false
   542  		})
   543  	default:
   544  		panic(fmt.Sprintf("TestRegion: invalid lock type %d", t))
   545  	}
   546  	return f
   547  }
   548  
   549  func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) {
   550  	l.mu.Lock()
   551  	defer l.mu.Unlock()
   552  
   553  	seg := l.locks.LowerBoundSegment(r.Start)
   554  	for seg.Ok() && seg.Start() < r.End {
   555  		lock := seg.Value()
   556  		if !check(lock, seg.Start(), seg.End()-seg.Start()) {
   557  			// Stop at the first conflict detected.
   558  			return
   559  		}
   560  		seg = seg.NextSegment()
   561  	}
   562  }