github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/lock/lock.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package lock is the API for POSIX-style advisory regional file locks and
    16  // BSD-style full file locks.
    17  //
    18  // Callers needing to enforce these types of locks, like sys_fcntl, can call
    19  // LockRegion and UnlockRegion on a thread-safe set of Locks.  Locks are
    20  // specific to a unique file (unique device/inode pair) and for this reason
    21  // should not be shared between files.
    22  //
    23  // A Lock has a set of holders identified by UniqueID.  Normally this is the
    24  // pid of the thread attempting to acquire the lock.
    25  //
    26  // Since these are advisory locks, they do not need to be integrated into
    27  // Reads/Writes and for this reason there is no way to *check* if a lock is
    28  // held.  One can only attempt to take a lock or unlock an existing lock.
    29  //
    30  // A Lock in a set of Locks is typed: it is either a read lock with any number
    31  // of readers and no writer, or a write lock with no readers.
    32  //
    33  // As expected from POSIX, any attempt to acquire a write lock on a file region
    34  // when there already exists a write lock held by a different uid will fail. Any
    35  // attempt to acquire a write lock on a file region when there is a reader
    36  // other than the requesting uid will fail.  Any attempt to acquire a read lock
    37  // on a file region when there is already a writer with a different uid will fail.
    38  //
    39  // In special cases, a read lock may be upgraded to a write lock and a write lock
    40  // can be downgraded to a read lock.  This can only happen if:
    41  //
    42  //   - read lock upgrade to write lock: There can be only one reader and the reader
    43  //     must be the same as the requested write lock holder.
    44  //
    45  //   - write lock downgrade to read lock: The writer must be the same as the requested
    46  //     read lock holder.
    47  //
    48  // UnlockRegion always succeeds.  If LockRegion fails the caller should normally
    49  // interpret this as "try again later".
    50  package lock
    51  
    52  import (
    53  	"fmt"
    54  	"math"
    55  
    56  	"golang.org/x/sys/unix"
    57  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    58  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    59  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    60  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    61  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    62  )
    63  
// LockType is a type of regional file lock. The values defined in this
// package are ReadLock and WriteLock.
type LockType int
    66  
// UniqueID is a unique identifier of the holder of a regional file lock.
//
// Values are used as map keys (Lock.Readers) and compared with == against
// Lock.Writer, so the dynamic type stored in a UniqueID must be comparable.
type UniqueID any
    69  
const (
	// ReadLock describes a POSIX regional file lock to be taken
	// read only.  There may be multiple of these locks on a single
	// file region as long as there is no writer lock on the same
	// region.
	ReadLock LockType = iota

	// WriteLock describes a POSIX regional file lock to be taken
	// for writing.  There may be only a single holder of this lock
	// and no read locks on the same region.
	WriteLock
)
    82  
// LockEOF is the maximal possible end of a regional file lock. Its value is
// math.MaxInt64.
//
// A BSD-style full file lock can be represented as a regional file lock from
// offset 0 to LockEOF.
const LockEOF = math.MaxInt64
    88  
// OwnerInfo describes the owner of a lock. It is stored per reader in
// Lock.Readers and for the writer in Lock.WriterInfo.
//
// +stateify savable
type OwnerInfo struct {
	// PID is the process ID of the lock owner.
	PID int32
	// OFD is whether this is an open file descriptor lock.
	OFD bool
}
    98  
// Lock is a regional file lock.  It consists of either a single writer
// or a set of readers.
//
// A Lock may be upgraded from a read lock to a write lock only if there
// is a single reader and that reader has the same UniqueID as the requested
// write lock holder.
//
// A Lock may be downgraded from a write lock to a read lock only if
// the writer's UniqueID is the same as the requested read lock holder.
//
// Accesses to Lock are synchronized through the Locks object to which it
// belongs.
//
// +stateify savable
type Lock struct {
	// Readers are the set of read lock holders identified by UniqueID.
	// If len(Readers) > 0 then Writer must be nil.
	Readers map[UniqueID]OwnerInfo

	// Writer holds the writer unique ID. It's nil if there are no writers.
	Writer UniqueID

	// WriterInfo describes the writer. It is only meaningful if Writer != nil.
	WriterInfo OwnerInfo
}
   123  
// Locks is a thread-safe wrapper around a LockSet.
//
// Locks also provides Readiness, EventRegister and EventUnregister so that a
// blocking LockRegion call can wait on it until UnlockRegion notifies
// blockedQueue.
//
// +stateify savable
type Locks struct {
	// mu protects locks below. It is not saved.
	mu sync.Mutex `state:"nosave"`

	// locks is the set of region locks currently held on an Inode.
	locks LockSet

	// blockedQueue is the queue of waiters that are waiting on a lock.
	blockedQueue waiter.Queue
}
   137  
// LockRegion attempts to acquire a typed lock for the uid on a region of a
// file. Returns nil if successful in locking the region, otherwise an
// appropriate error is returned.
//
// When block is false, or the attempt fails with anything other than
// linuxerr.ErrWouldBlock, the result of the attempt is returned as-is. When
// block is true and the attempt would block, the caller waits in ctx.BlockOn
// and retries each time BlockOn reports true; if BlockOn reports false,
// linuxerr.ErrInterrupted is returned.
func (l *Locks) LockRegion(ctx context.Context, uid UniqueID, ownerPID int32, t LockType, r LockRange, ofd bool, block bool) error {
	l.mu.Lock()
	defer l.mu.Unlock()
	for {

		// Blocking locks must run in a loop because we'll be woken up whenever an unlock event
		// happens for this lock. We will then attempt to take the lock again and if it fails
		// continue blocking.
		err := l.locks.lock(uid, ownerPID, t, r, ofd)
		if err == linuxerr.ErrWouldBlock && block {
			// Note: we release the lock in EventRegister below, in
			// order to avoid a possible race. l.mu is still held at
			// this point; EventRegister unlocks it only after the
			// waiter has been added to blockedQueue, and UnlockRegion
			// needs l.mu before it can notify.
			ok := ctx.BlockOn(l, waiter.EventIn)
			// Re-acquire l.mu so that both the retry and the deferred
			// Unlock above run with the mutex held.
			l.mu.Lock() // +checklocksforce: see above.
			if ok {
				continue // Try again now that someone has unlocked.
			}
			// Must be interrupted.
			return linuxerr.ErrInterrupted
		}

		return err
	}
}
   165  
// Readiness always returns zero, regardless of the requested event mask.
func (l *Locks) Readiness(waiter.EventMask) waiter.EventMask {
	return 0
}
   170  
// EventRegister implements waiter.Waitable.EventRegister.
//
// It adds e to blockedQueue and then unlocks l.mu, which LockRegion locked
// before it started blocking. It always returns nil.
//
// Preconditions: l.mu must be locked.
func (l *Locks) EventRegister(e *waiter.Entry) error {
	// The deferred unlock runs after e has been queued, so a concurrent
	// UnlockRegion (which takes l.mu) cannot notify before e is registered.
	defer l.mu.Unlock() // +checklocksforce: see above.
	l.blockedQueue.EventRegister(e)
	return nil
}
   177  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// It removes e from blockedQueue. Unlike EventRegister, it does not touch
// l.mu.
func (l *Locks) EventUnregister(e *waiter.Entry) {
	l.blockedQueue.EventUnregister(e)
}
   182  
// UnlockRegion attempts to release a lock for the uid on a region of a file.
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
//
// Waiters in blockedQueue are notified on every call, whether or not a lock
// was actually released.
func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.locks.unlock(uid, r)

	// Now that we've released the lock, we need to wake up any waiters.
	// The notification is sent while l.mu is still held; blocked
	// LockRegion callers retry once they have re-acquired l.mu.
	l.blockedQueue.Notify(waiter.EventIn)
}
   196  
   197  // makeLock returns a new typed Lock that has either uid as its only reader
   198  // or uid as its only writer.
   199  func makeLock(uid UniqueID, ownerPID int32, t LockType, ofd bool) Lock {
   200  	value := Lock{Readers: make(map[UniqueID]OwnerInfo)}
   201  	switch t {
   202  	case ReadLock:
   203  		value.Readers[uid] = OwnerInfo{PID: ownerPID, OFD: ofd}
   204  	case WriteLock:
   205  		value.Writer = uid
   206  		value.WriterInfo = OwnerInfo{PID: ownerPID, OFD: ofd}
   207  	default:
   208  		panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
   209  	}
   210  	return value
   211  }
   212  
   213  // isHeld returns true if uid is a holder of Lock.
   214  func (l Lock) isHeld(uid UniqueID) bool {
   215  	if _, ok := l.Readers[uid]; ok {
   216  		return true
   217  	}
   218  	return l.Writer == uid
   219  }
   220  
   221  // lock sets uid as a holder of a typed lock on Lock.
   222  //
   223  // Preconditions: canLock is true for the range containing this Lock.
   224  func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType, ofd bool) {
   225  	switch t {
   226  	case ReadLock:
   227  		// If we are already a reader, then this is a no-op.
   228  		if _, ok := l.Readers[uid]; ok {
   229  			return
   230  		}
   231  		// We cannot downgrade a write lock to a read lock unless the
   232  		// uid is the same.
   233  		if l.Writer != nil {
   234  			if l.Writer != uid {
   235  				panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
   236  			}
   237  			// Ensure that there is only one reader if upgrading.
   238  			l.Readers = make(map[UniqueID]OwnerInfo)
   239  			// Ensure that there is no longer a writer.
   240  			l.Writer = nil
   241  		}
   242  		l.Readers[uid] = OwnerInfo{PID: ownerPID, OFD: ofd}
   243  		return
   244  	case WriteLock:
   245  		// If we are already the writer, then this is a no-op.
   246  		if l.Writer == uid {
   247  			return
   248  		}
   249  		// We can only upgrade a read lock to a write lock if there
   250  		// is only one reader and that reader has the same uid as
   251  		// the write lock.
   252  		if readers := len(l.Readers); readers > 0 {
   253  			if readers != 1 {
   254  				panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers))
   255  			}
   256  			if _, ok := l.Readers[uid]; !ok {
   257  				panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers))
   258  			}
   259  		}
   260  		// Ensure that there is only a writer.
   261  		l.Readers = make(map[UniqueID]OwnerInfo)
   262  		l.Writer = uid
   263  		l.WriterInfo = OwnerInfo{PID: ownerPID, OFD: ofd}
   264  	default:
   265  		panic(fmt.Sprintf("lock: invalid lock type %d", t))
   266  	}
   267  }
   268  
   269  // lockable returns true if check returns true for every Lock in LockRange.
   270  // Further, check should return true if Lock meets the callers requirements
   271  // for locking Lock.
   272  func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool {
   273  	// Get our starting point.
   274  	seg := l.LowerBoundSegment(r.Start)
   275  	for seg.Ok() && seg.Start() < r.End {
   276  		// Note that we don't care about overrunning the end of the
   277  		// last segment because if everything checks out we'll just
   278  		// split the last segment.
   279  		if !check(seg.Value()) {
   280  			return false
   281  		}
   282  		// Jump to the next segment, ignoring gaps, for the same
   283  		// reason we ignored the first gap.
   284  		seg = seg.NextSegment()
   285  	}
   286  	// No conflict, we can get a lock for uid over the entire range.
   287  	return true
   288  }
   289  
   290  // canLock returns true if uid will be able to take a Lock of type t on the
   291  // entire range specified by LockRange.
   292  func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
   293  	switch t {
   294  	case ReadLock:
   295  		return l.lockable(r, func(value Lock) bool {
   296  			// If there is no writer, there's no problem adding another reader.
   297  			if value.Writer == nil {
   298  				return true
   299  			}
   300  			// If there is a writer, then it must be the same uid
   301  			// in order to downgrade the lock to a read lock.
   302  			return value.Writer == uid
   303  		})
   304  	case WriteLock:
   305  		return l.lockable(r, func(value Lock) bool {
   306  			// If there are only readers.
   307  			if value.Writer == nil {
   308  				// Then this uid can only take a write lock if this is a private
   309  				// upgrade, meaning that the only reader is uid.
   310  				return value.isOnlyReader(uid)
   311  			}
   312  			// If the uid is already a writer on this region, then
   313  			// adding a write lock would be a no-op.
   314  			return value.Writer == uid
   315  		})
   316  	default:
   317  		panic(fmt.Sprintf("canLock: invalid lock type %d", t))
   318  	}
   319  }
   320  
   321  func (l *Lock) isOnlyReader(uid UniqueID) bool {
   322  	if len(l.Readers) != 1 {
   323  		return false
   324  	}
   325  	_, ok := l.Readers[uid]
   326  	return ok
   327  }
   328  
// lock returns nil if uid took a lock of type t on the entire range of
// LockRange. Otherwise, linuxerr.ErrWouldBlock is returned.
//
// The whole range is validated with canLock before any segment is inserted
// or split, so the set is not modified when ErrWouldBlock is returned. A
// zero-length range is a successful no-op.
//
// Preconditions: r.Start <= r.End (will panic otherwise).
func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange, ofd bool) error {
	if r.Start > r.End {
		panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End))
	}

	// Don't attempt to insert anything with a range of 0 and treat this
	// as a successful no-op.
	if r.Length() == 0 {
		return nil
	}

	// Do a first-pass check. We *could* hold onto the segments we checked
	// if canLock would return true, but traversing the segment set should
	// be fast and this keeps things simple.
	if !l.canLock(uid, t, r) {
		return linuxerr.ErrWouldBlock
	}

	// Get our starting point: r.Start lies either in a gap or inside an
	// existing segment.
	seg, gap := l.Find(r.Start)
	if gap.Ok() {
		// Fill in the gap and get the next segment to modify. Only the
		// part of the gap that overlaps r is inserted.
		seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t, ofd)).NextSegment()
	} else if seg.Start() < r.Start {
		// Get our first segment to modify: split off the part before
		// r.Start so that it keeps its current holders.
		_, seg = l.Split(seg, r.Start)
	}
	for seg.Ok() && seg.Start() < r.End {
		// Split the last one if necessary, so that the part beyond
		// r.End is left untouched.
		if seg.End() > r.End {
			seg, _ = l.SplitUnchecked(seg, r.End)
		}

		// Set the lock on the segment. This is guaranteed to
		// always be safe, given canLock above.
		value := seg.ValuePtr()
		value.lock(uid, ownerPID, t, ofd)

		// Fill subsequent gaps. If the following gap does not overlap
		// r, just advance to the segment after it.
		gap = seg.NextGap()
		if gr := gap.Range().Intersect(r); gr.Length() > 0 {
			seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t, ofd)).NextSegment()
		} else {
			seg = gap.NextSegment()
		}
	}

	return nil
}
   382  
   383  // unlock is always successful.  If uid has no locks held for the range LockRange,
   384  // unlock is a no-op.
   385  //
   386  // Preconditions: same as lock.
   387  func (l *LockSet) unlock(uid UniqueID, r LockRange) {
   388  	if r.Start > r.End {
   389  		panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End))
   390  	}
   391  
   392  	// Same as setlock.
   393  	if r.Length() == 0 {
   394  		return
   395  	}
   396  
   397  	// Get our starting point.
   398  	seg := l.LowerBoundSegment(r.Start)
   399  	for seg.Ok() && seg.Start() < r.End {
   400  		// If this segment doesn't have a lock from uid then
   401  		// there is no need to fragment the set with Isolate (below).
   402  		// In this case just move on to the next segment.
   403  		if !seg.Value().isHeld(uid) {
   404  			seg = seg.NextSegment()
   405  			continue
   406  		}
   407  
   408  		// Ensure that if we need to unlock a sub-segment that
   409  		// we don't unlock/remove that entire segment.
   410  		seg = l.Isolate(seg, r)
   411  
   412  		value := seg.Value()
   413  		var remove bool
   414  		if value.Writer == uid {
   415  			// If we are unlocking a writer, then since there can
   416  			// only ever be one writer and no readers, then this
   417  			// lock should always be removed from the set.
   418  			remove = true
   419  		} else if _, ok := value.Readers[uid]; ok {
   420  			// If uid is the last reader, then just remove the entire
   421  			// segment.
   422  			if len(value.Readers) == 1 {
   423  				remove = true
   424  			} else {
   425  				// Otherwise we need to remove this reader without
   426  				// affecting any other segment's readers.  To do
   427  				// this, we need to make a copy of the Readers map
   428  				// and not add this uid.
   429  				newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)}
   430  				for k, v := range value.Readers {
   431  					if k != uid {
   432  						newValue.Readers[k] = v
   433  					}
   434  				}
   435  				seg.SetValue(newValue)
   436  			}
   437  		}
   438  		if remove {
   439  			seg = l.Remove(seg).NextSegment()
   440  		} else {
   441  			seg = seg.NextSegment()
   442  		}
   443  	}
   444  }
   445  
   446  // ComputeRange takes a positive file offset and computes the start of a LockRange
   447  // using start (relative to offset) and the end of the LockRange using length. The
   448  // values of start and length may be negative but the resulting LockRange must
   449  // preserve that LockRange.Start < LockRange.End and LockRange.Start > 0.
   450  func ComputeRange(start, length, offset int64) (LockRange, error) {
   451  	offset += start
   452  	// fcntl(2): "l_start can be a negative number provided the offset
   453  	// does not lie before the start of the file"
   454  	if offset < 0 {
   455  		return LockRange{}, unix.EINVAL
   456  	}
   457  
   458  	// fcntl(2): Specifying 0 for l_len has the  special meaning: lock all
   459  	// bytes starting at the location specified by l_whence and l_start
   460  	// through to the end of file, no matter how large the file grows.
   461  	end := uint64(LockEOF)
   462  	if length > 0 {
   463  		// fcntl(2): If l_len is positive, then the range to be locked
   464  		// covers bytes l_start up to and including l_start+l_len-1.
   465  		//
   466  		// Since LockRange.End is exclusive we need not -1 from length..
   467  		end = uint64(offset + length)
   468  	} else if length < 0 {
   469  		// fcntl(2): If l_len is negative, the interval described by
   470  		// lock covers bytes l_start+l_len up to and including l_start-1.
   471  		//
   472  		// Since LockRange.End is exclusive we need not -1 from offset.
   473  		signedEnd := offset
   474  		// Add to offset using a negative length (subtract).
   475  		offset += length
   476  		if offset < 0 {
   477  			return LockRange{}, unix.EINVAL
   478  		}
   479  		if signedEnd < offset {
   480  			return LockRange{}, unix.EOVERFLOW
   481  		}
   482  		// At this point signedEnd cannot be negative,
   483  		// since we asserted that offset is not negative
   484  		// and it is not less than offset.
   485  		end = uint64(signedEnd)
   486  	}
   487  	// Offset is guaranteed to be positive at this point.
   488  	return LockRange{Start: uint64(offset), End: end}, nil
   489  }
   490  
   491  // TestRegion checks whether the lock holder identified by uid can hold a lock
   492  // of type t on range r. It returns a Flock struct representing this
   493  // information as the F_GETLK fcntl does.
   494  //
   495  // Note that the PID returned in the flock structure is relative to the root PID
   496  // namespace. It needs to be converted to the caller's PID namespace before
   497  // returning to userspace.
   498  func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange, ofd bool) linux.Flock {
   499  	f := linux.Flock{Type: linux.F_UNLCK}
   500  	switch t {
   501  	case ReadLock:
   502  		l.testRegion(r, func(lock Lock, start, length uint64) bool {
   503  			if lock.Writer == nil || lock.Writer == uid {
   504  				return true
   505  			}
   506  			f.Type = linux.F_WRLCK
   507  			f.PID = lock.WriterInfo.PID
   508  			f.Start = int64(start)
   509  			f.Len = int64(length)
   510  			return false
   511  		})
   512  	case WriteLock:
   513  		l.testRegion(r, func(lock Lock, start, length uint64) bool {
   514  			if lock.Writer == nil {
   515  				for k, v := range lock.Readers {
   516  					if k != uid && v.OFD == ofd {
   517  						// Stop at the first conflict detected.
   518  						f.Type = linux.F_RDLCK
   519  						f.PID = v.PID
   520  						f.Start = int64(start)
   521  						f.Len = int64(length)
   522  						return false
   523  					}
   524  				}
   525  				return true
   526  			}
   527  			if lock.Writer == uid {
   528  				return true
   529  			}
   530  			f.Type = linux.F_WRLCK
   531  			f.PID = lock.WriterInfo.PID
   532  			f.Start = int64(start)
   533  			f.Len = int64(length)
   534  			return false
   535  		})
   536  	default:
   537  		panic(fmt.Sprintf("TestRegion: invalid lock type %d", t))
   538  	}
   539  	return f
   540  }
   541  
   542  func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) {
   543  	l.mu.Lock()
   544  	defer l.mu.Unlock()
   545  
   546  	seg := l.locks.LowerBoundSegment(r.Start)
   547  	for seg.Ok() && seg.Start() < r.End {
   548  		lock := seg.Value()
   549  		if !check(lock, seg.Start(), seg.End()-seg.Start()) {
   550  			// Stop at the first conflict detected.
   551  			return
   552  		}
   553  		seg = seg.NextSegment()
   554  	}
   555  }