github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/lisafs/node.go (about)

     1  // Copyright 2022 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package lisafs
    16  
    17  import (
    18  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    19  	"github.com/metacubex/gvisor/pkg/context"
    20  	"github.com/metacubex/gvisor/pkg/fspath"
    21  	"github.com/metacubex/gvisor/pkg/sync"
    22  )
    23  
// numStaticChildren is the number of children each Node tracks in its
// fixed-size staticChildren array before it switches to a map.
//
// Sampling certain filesystem-heavy workloads showed that a majority of
// directories hold at most 5 children. The value should be kept low to
// minimize the per-node memory overhead; 5 is fairly low and at the same time
// avoids a map allocation for the majority of nodes. Benchmarking also showed
// that static arrays are faster than maps for lookups until n=8.
const numStaticChildren = 5
    31  
// Node is a node on the filesystem tree. A Node is shared by all the
// ControlFDs opened on that position. For a given Server, there will only be
// one Node for a given filesystem position.
//
// Reference Model:
//   - Each node holds a ref on its parent for its entire lifetime.
type Node struct {
	// The node's ref count is protected by its parent's childrenMu.
	nodeRefs

	// opMu synchronizes high level operations on this path.
	//
	// It is used to ensure the following, which are important for security:
	//	* This node's data is protected by opMu. So all operations that change
	//   its data must hold opMu for writing. For example: write, setstat,
	//   setxattr, etc. This entails that if this node represents a directory,
	//   creation and deletion operations happening directly under this
	//   directory must lock opMu for writing. All operations accessing data
	//   must hold opMu for reading. This is to avoid the can of worms that
	//   opens when creation and deletion are allowed to race. This prevents
	//   any walks from occurring during creation or deletion.
	//	* When this node is being deleted, the deletion handler must hold opMu
	//   for writing. This ensures that there are no concurrent operations
	//   going on this node while it is being deleted and potentially being
	//   replaced with something hazardous.
	//
	// A useful consequence of the above is that holding opMu for reading
	// guarantees that the Server can not change Nodes on the path up to this
	// Node. For instance, if the grandparent needs to be renamed or deleted,
	// the client must first delete this node to avoid an ENOTEMPTY error.
	// Deleting this node is not possible while opMu is read locked.
	opMu sync.RWMutex

	// deleted indicates whether the backing file has been unlinked. This can
	// be used to deny operations on FDs on this Node after deletion because it
	// is not safe for FD implementations to do host walks up to this position
	// anymore. This node may have been replaced with something hazardous.
	// deleted is protected by opMu. Additionally, deleted must only be
	// accessed/mutated using atomics; see markDeletedRecursive for more
	// details.
	deleted atomicbitops.Uint32

	// name is the name of the file represented by this Node in parent. If this
	// Node represents the root directory, then name is an empty string. name
	// is protected by the backing server's rename mutex.
	name string

	// parent is the parent node, which tracks this node as a child. parent is
	// protected by the backing server's rename mutex.
	parent *Node

	// controlFDs is a linked list of all the ControlFDs opened on this node.
	// Prefer this over a slice to avoid additional allocations. Each ControlFD
	// is an implicit linked list node so there are no additional allocations
	// needed to maintain the linked list. controlFDs is guarded by
	// controlFDsMu.
	controlFDsMu sync.Mutex
	controlFDs   controlFDList

	// Here is a performance hack. Past experience has shown that map
	// allocations on each node for tracking children cost a lot of memory.
	// More small allocations also fragment memory. To save allocations,
	// statically track up to numStaticChildren children in a fixed-size array.
	// If more children are inserted then move everything to a map. Use
	// dynamicChildren iff it is non-nil.

	// The following fields are protected by childrenMu.
	childrenMu sync.Mutex
	// staticChildren holds children while dynamicChildren is nil. An unused
	// slot has a nil node and an empty name.
	staticChildren [numStaticChildren]struct {
		name string
		node *Node
	}
	// dynamicChildren maps child name to child Node. Once it is non-nil it is
	// the only container consulted for children.
	dynamicChildren map[string]*Node
}
   103  
   104  // DecRef implements refs.RefCounter.DecRef. Note that the context
   105  // parameter should never be used. It exists solely to comply with the
   106  // refs.RefCounter interface.
   107  //
   108  // Precondition: server's rename mutex must be at least read locked.
   109  func (n *Node) DecRef(context.Context) {
   110  	if n.parent == nil {
   111  		n.nodeRefs.DecRef(nil)
   112  		return
   113  	}
   114  	// If this is the only ref on node then it will need to be destroyed.
   115  	n.parent.childrenMu.Lock()
   116  	deleted := false
   117  	n.nodeRefs.DecRef(func() {
   118  		n.parent.removeChildLocked(n.name)
   119  		deleted = true
   120  	})
   121  	n.parent.childrenMu.Unlock()
   122  	if deleted {
   123  		// Drop ref on parent. Keep Decref call lock free for scalability.
   124  		n.parent.DecRef(nil)
   125  	}
   126  }
   127  
   128  // InitLocked must be called before first use of fd.
   129  //
   130  // Precondition: parent.childrenMu is locked.
   131  //
   132  // Postconditions: A ref on n is transferred to the caller.
   133  func (n *Node) InitLocked(name string, parent *Node) {
   134  	n.nodeRefs.InitRefs()
   135  	n.name = name
   136  	n.parent = parent
   137  	if parent != nil {
   138  		parent.IncRef()
   139  		parent.insertChildLocked(name, n)
   140  	}
   141  }
   142  
   143  // LookupChildLocked looks up for a child with given name. Returns nil if child
   144  // does not exist.
   145  //
   146  // Preconditions: childrenMu is locked.
   147  func (n *Node) LookupChildLocked(name string) *Node {
   148  	if n.dynamicChildren != nil {
   149  		return n.dynamicChildren[name]
   150  	}
   151  
   152  	for i := 0; i < numStaticChildren; i++ {
   153  		if n.staticChildren[i].name == name {
   154  			return n.staticChildren[i].node
   155  		}
   156  	}
   157  	return nil
   158  }
   159  
   160  // WithChildrenMu executes fn with n.childrenMu locked.
   161  func (n *Node) WithChildrenMu(fn func()) {
   162  	n.childrenMu.Lock()
   163  	defer n.childrenMu.Unlock()
   164  	fn()
   165  }
   166  
   167  // FilePath returns the absolute path of the backing file. This is an expensive
   168  // operation. The returned path should be free of any intermediate symlinks
   169  // because all internal (non-leaf) nodes are directories.
   170  //
   171  // Precondition:
   172  //   - server's rename mutex must be at least read locked. Calling handlers must
   173  //     at least have read concurrency guarantee from the server.
   174  func (n *Node) FilePath() string {
   175  	// Walk upwards and prepend name to res.
   176  	var res fspath.Builder
   177  	for n.parent != nil {
   178  		res.PrependComponent(n.name)
   179  		n = n.parent
   180  	}
   181  	// n is the root node.
   182  	res.PrependByte('/')
   183  	return res.String()
   184  }
   185  
   186  func (n *Node) isDeleted() bool {
   187  	return n.deleted.Load() != 0
   188  }
   189  
   190  func (n *Node) removeFD(fd *ControlFD) {
   191  	n.controlFDsMu.Lock()
   192  	defer n.controlFDsMu.Unlock()
   193  	n.controlFDs.Remove(fd)
   194  }
   195  
   196  func (n *Node) insertFD(fd *ControlFD) {
   197  	n.controlFDsMu.Lock()
   198  	defer n.controlFDsMu.Unlock()
   199  	n.controlFDs.PushBack(fd)
   200  }
   201  
   202  func (n *Node) forEachFD(fn func(*ControlFD)) {
   203  	n.controlFDsMu.Lock()
   204  	defer n.controlFDsMu.Unlock()
   205  	for fd := n.controlFDs.Front(); fd != nil; fd = fd.Next() {
   206  		fn(fd)
   207  	}
   208  }
   209  
   210  // removeChildLocked removes child with given name from n and returns the
   211  // removed child. Returns nil if no such child existed.
   212  //
   213  // Precondition: childrenMu is locked.
   214  func (n *Node) removeChildLocked(name string) *Node {
   215  	if n.dynamicChildren != nil {
   216  		toRemove := n.dynamicChildren[name]
   217  		delete(n.dynamicChildren, name)
   218  		return toRemove
   219  	}
   220  
   221  	for i := 0; i < numStaticChildren; i++ {
   222  		if n.staticChildren[i].name == name {
   223  			toRemove := n.staticChildren[i].node
   224  			n.staticChildren[i].name = ""
   225  			n.staticChildren[i].node = nil
   226  			return toRemove
   227  		}
   228  	}
   229  	return nil
   230  }
   231  
   232  // insertChildLocked inserts child into n. It does not check for duplicates.
   233  //
   234  // Precondition: childrenMu is locked.
   235  func (n *Node) insertChildLocked(name string, child *Node) {
   236  	// Try to insert statically first if staticChildren is still being used.
   237  	if n.dynamicChildren == nil {
   238  		for i := 0; i < numStaticChildren; i++ {
   239  			if n.staticChildren[i].node == nil {
   240  				n.staticChildren[i].node = child
   241  				n.staticChildren[i].name = name
   242  				return
   243  			}
   244  		}
   245  
   246  		// Ran out of static space. Need to start inserting dynamically.
   247  		// Shift everything to the map.
   248  		n.dynamicChildren = make(map[string]*Node)
   249  		for i := 0; i < numStaticChildren; i++ {
   250  			// From above loop we know all staticChildren entries are non-nil.
   251  			n.dynamicChildren[n.staticChildren[i].name] = n.staticChildren[i].node
   252  			n.staticChildren[i].name = ""
   253  			n.staticChildren[i].node = nil
   254  		}
   255  	}
   256  
   257  	n.dynamicChildren[name] = child
   258  }
   259  
   260  func (n *Node) forEachChild(fn func(*Node)) {
   261  	n.childrenMu.Lock()
   262  	defer n.childrenMu.Unlock()
   263  
   264  	if n.dynamicChildren != nil {
   265  		for _, child := range n.dynamicChildren {
   266  			fn(child)
   267  		}
   268  		return
   269  	}
   270  
   271  	for i := 0; i < numStaticChildren; i++ {
   272  		if n.staticChildren[i].node != nil {
   273  			fn(n.staticChildren[i].node)
   274  		}
   275  	}
   276  }
   277  
   278  // Precondition: opMu must be locked for writing on the root node being marked
   279  // as deleted.
   280  func (n *Node) markDeletedRecursive() {
   281  	n.deleted.Store(1)
   282  
   283  	// No need to hold opMu for children as it introduces lock ordering issues
   284  	// because forEachChild locks childrenMu. Locking opMu after childrenMu
   285  	// violates the lock ordering. Anyway if a directory is being deleted, it
   286  	// must not have children. The client must have already deleted the entire
   287  	// subtree. If the client did not delete this subtree nodes, then the subtree
   288  	// was deleted externally and there is not much we can do. This is best
   289  	// effort work to mark the subtree as deleted.
   290  	n.forEachChild(func(child *Node) {
   291  		child.markDeletedRecursive()
   292  	})
   293  }