gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/lisafs/node.go (about) 1 // Copyright 2022 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package lisafs 16 17 import ( 18 "gvisor.dev/gvisor/pkg/atomicbitops" 19 "gvisor.dev/gvisor/pkg/context" 20 "gvisor.dev/gvisor/pkg/fspath" 21 "gvisor.dev/gvisor/pkg/sync" 22 ) 23 24 // numStaticChildren is the number of static children tracked by each node. 25 // Sampling certain filesystem heavy workloads showed that a majority of 26 // directories store at most 5 children in their map. This should be kept low 27 // to minimize the memory overhead for each node. 5 is fairly low and at the 28 // same time helps avoid map allocations for majority of nodes. Benchmarking 29 // also showed that static arrays are faster than maps for lookups until n=8. 30 const numStaticChildren = 5 31 32 // Node is a node on the filesystem tree. A Node is shared by all the 33 // ControlFDs opened on that position. For a given Server, there will only be 34 // one Node for a given filesystem position. 35 // 36 // Reference Model: 37 // - Each node holds a ref on its parent for its entire lifetime. 38 type Node struct { 39 // node's ref count is protected by its parent's childrenMu. 40 nodeRefs 41 42 // opMu synchronizes high level operations on this path. 43 // 44 // It is used to ensure the following which are important for security: 45 // * This node's data is protected by opMu. So all operations that change its 46 // data should hold opMu for writing. For example: write, setstat, setxattr, 47 // etc. This entails that if this node represents a directory, creation and 48 // deletion operations happening directly under this directory must lock 49 // opMu for writing. All operations accessing data must hold opMu for 50 // reading. This is to avoid the can of worms that open when creation and 51 // deletion are allowed to race. This prevents any walks from occurring 52 // during creation or deletion. 53 // * When this node is being deleted, the deletion handler must hold opMu for 54 // writing. This ensures that there are no concurrent operations going on 55 // this node while it is being deleted and potentially being replaced with 56 // something hazardous. 57 // 58 // A useful consequence of the above is that holding opMu for reading 59 // guarantees that the Server can not change Nodes on the path until this 60 // Node. For instance, if the grandparent needs to be renamed or deleted, 61 // the client must first delete this node to avoid ENOTEMPTY error. Deleting 62 // this node is not possible while opMu is read locked. 63 opMu sync.RWMutex 64 65 // deleted indicates whether the backing file has been unlinked. This can be 66 // used to deny operations on FDs on this Node after deletion because it is 67 // not safe for FD implementations to do host walks up to this position 68 // anymore. This node may have been replaced with something hazardous. 69 // deleted is protected by opMu. deleted must only be accessed/mutated using 70 // atomics; see markDeletedRecursive for more details. 71 deleted atomicbitops.Uint32 72 73 // name is the name of the file represented by this Node in parent. If this 74 // FD represents the root directory, then name is an empty string. name is 75 // protected by the backing server's rename mutex. 76 name string 77 78 // parent is this parent node which tracks this node as a child. parent is 79 // protected by the backing server's rename mutex. 80 parent *Node 81 82 // controlFDs is a linked list of all the ControlFDs opened on this node. 83 // Prefer this over a slice to avoid additional allocations. Each ControlFD 84 // is an implicit linked list node so there are no additional allocations 85 // needed to maintain the linked list. 86 controlFDsMu sync.Mutex 87 controlFDs controlFDList 88 89 // Here is a performance hack. Past experience has shown that map allocations 90 // on each node for tracking children costs a lot of memory. More small 91 // allocations also fragment memory. To save allocations, statically track 92 // upto numStaticChildren children using hardcoded pointers. If more children 93 // are inserted then move to a map. Use dynamicChildren iff it is non-nil. 94 95 // The following fields are protected by childrenMu. 96 childrenMu sync.Mutex 97 staticChildren [numStaticChildren]struct { 98 name string 99 node *Node 100 } 101 dynamicChildren map[string]*Node 102 } 103 104 // DecRef implements refs.RefCounter.DecRef. Note that the context 105 // parameter should never be used. It exists solely to comply with the 106 // refs.RefCounter interface. 107 // 108 // Precondition: server's rename mutex must be at least read locked. 109 func (n *Node) DecRef(context.Context) { 110 if n.parent == nil { 111 n.nodeRefs.DecRef(nil) 112 return 113 } 114 // If this is the only ref on node then it will need to be destroyed. 115 n.parent.childrenMu.Lock() 116 deleted := false 117 n.nodeRefs.DecRef(func() { 118 n.parent.removeChildLocked(n.name) 119 deleted = true 120 }) 121 n.parent.childrenMu.Unlock() 122 if deleted { 123 // Drop ref on parent. Keep Decref call lock free for scalability. 124 n.parent.DecRef(nil) 125 } 126 } 127 128 // InitLocked must be called before first use of fd. 129 // 130 // Precondition: parent.childrenMu is locked. 131 // 132 // Postconditions: A ref on n is transferred to the caller. 133 func (n *Node) InitLocked(name string, parent *Node) { 134 n.nodeRefs.InitRefs() 135 n.name = name 136 n.parent = parent 137 if parent != nil { 138 parent.IncRef() 139 parent.insertChildLocked(name, n) 140 } 141 } 142 143 // LookupChildLocked looks up for a child with given name. Returns nil if child 144 // does not exist. 145 // 146 // Preconditions: childrenMu is locked. 147 func (n *Node) LookupChildLocked(name string) *Node { 148 if n.dynamicChildren != nil { 149 return n.dynamicChildren[name] 150 } 151 152 for i := 0; i < numStaticChildren; i++ { 153 if n.staticChildren[i].name == name { 154 return n.staticChildren[i].node 155 } 156 } 157 return nil 158 } 159 160 // WithChildrenMu executes fn with n.childrenMu locked. 161 func (n *Node) WithChildrenMu(fn func()) { 162 n.childrenMu.Lock() 163 defer n.childrenMu.Unlock() 164 fn() 165 } 166 167 // FilePath returns the absolute path of the backing file. This is an expensive 168 // operation. The returned path should be free of any intermediate symlinks 169 // because all internal (non-leaf) nodes are directories. 170 // 171 // Precondition: 172 // - server's rename mutex must be at least read locked. Calling handlers must 173 // at least have read concurrency guarantee from the server. 174 func (n *Node) FilePath() string { 175 // Walk upwards and prepend name to res. 176 var res fspath.Builder 177 for n.parent != nil { 178 res.PrependComponent(n.name) 179 n = n.parent 180 } 181 // n is the root node. 182 res.PrependByte('/') 183 return res.String() 184 } 185 186 func (n *Node) isDeleted() bool { 187 return n.deleted.Load() != 0 188 } 189 190 func (n *Node) removeFD(fd *ControlFD) { 191 n.controlFDsMu.Lock() 192 defer n.controlFDsMu.Unlock() 193 n.controlFDs.Remove(fd) 194 } 195 196 func (n *Node) insertFD(fd *ControlFD) { 197 n.controlFDsMu.Lock() 198 defer n.controlFDsMu.Unlock() 199 n.controlFDs.PushBack(fd) 200 } 201 202 func (n *Node) forEachFD(fn func(*ControlFD)) { 203 n.controlFDsMu.Lock() 204 defer n.controlFDsMu.Unlock() 205 for fd := n.controlFDs.Front(); fd != nil; fd = fd.Next() { 206 fn(fd) 207 } 208 } 209 210 // removeChildLocked removes child with given name from n and returns the 211 // removed child. Returns nil if no such child existed. 212 // 213 // Precondition: childrenMu is locked. 214 func (n *Node) removeChildLocked(name string) *Node { 215 if n.dynamicChildren != nil { 216 toRemove := n.dynamicChildren[name] 217 delete(n.dynamicChildren, name) 218 return toRemove 219 } 220 221 for i := 0; i < numStaticChildren; i++ { 222 if n.staticChildren[i].name == name { 223 toRemove := n.staticChildren[i].node 224 n.staticChildren[i].name = "" 225 n.staticChildren[i].node = nil 226 return toRemove 227 } 228 } 229 return nil 230 } 231 232 // insertChildLocked inserts child into n. It does not check for duplicates. 233 // 234 // Precondition: childrenMu is locked. 235 func (n *Node) insertChildLocked(name string, child *Node) { 236 // Try to insert statically first if staticChildren is still being used. 237 if n.dynamicChildren == nil { 238 for i := 0; i < numStaticChildren; i++ { 239 if n.staticChildren[i].node == nil { 240 n.staticChildren[i].node = child 241 n.staticChildren[i].name = name 242 return 243 } 244 } 245 246 // Ran out of static space. Need to start inserting dynamically. 247 // Shift everything to the map. 248 n.dynamicChildren = make(map[string]*Node) 249 for i := 0; i < numStaticChildren; i++ { 250 // From above loop we know all staticChildren entries are non-nil. 251 n.dynamicChildren[n.staticChildren[i].name] = n.staticChildren[i].node 252 n.staticChildren[i].name = "" 253 n.staticChildren[i].node = nil 254 } 255 } 256 257 n.dynamicChildren[name] = child 258 } 259 260 func (n *Node) forEachChild(fn func(*Node)) { 261 n.childrenMu.Lock() 262 defer n.childrenMu.Unlock() 263 264 if n.dynamicChildren != nil { 265 for _, child := range n.dynamicChildren { 266 fn(child) 267 } 268 return 269 } 270 271 for i := 0; i < numStaticChildren; i++ { 272 if n.staticChildren[i].node != nil { 273 fn(n.staticChildren[i].node) 274 } 275 } 276 } 277 278 // Precondition: opMu must be locked for writing on the root node being marked 279 // as deleted. 280 func (n *Node) markDeletedRecursive() { 281 n.deleted.Store(1) 282 283 // No need to hold opMu for children as it introduces lock ordering issues 284 // because forEachChild locks childrenMu. Locking opMu after childrenMu 285 // violates the lock ordering. Anyway if a directory is being deleted, it 286 // must not have children. The client must have already deleted the entire 287 // subtree. If the client did not delete this subtree nodes, then the subtree 288 // was deleted externally and there is not much we can do. This is best 289 // effort work to mark the subtree as deleted. 290 n.forEachChild(func(child *Node) { 291 child.markDeletedRecursive() 292 }) 293 }