github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/gofer/directory.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gofer 16 17 import ( 18 "fmt" 19 20 "github.com/metacubex/gvisor/pkg/abi/linux" 21 "github.com/metacubex/gvisor/pkg/atomicbitops" 22 "github.com/metacubex/gvisor/pkg/context" 23 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 24 "github.com/metacubex/gvisor/pkg/hostarch" 25 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 26 "github.com/metacubex/gvisor/pkg/sentry/kernel/pipe" 27 "github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport" 28 "github.com/metacubex/gvisor/pkg/sentry/vfs" 29 "github.com/metacubex/gvisor/pkg/sync" 30 ) 31 32 func (d *dentry) isDir() bool { 33 return d.fileType() == linux.S_IFDIR 34 } 35 36 // cacheNewChildLocked will cache the new child dentry, and will panic if a 37 // non-negative child is already cached. It is the caller's responsibility to 38 // check that the child does not exist before calling this method. 39 // 40 // Preconditions: 41 // - filesystem.renameMu must be locked. 42 // - If the addition to the dentry tree is due to a read-only operation (like 43 // Walk), then d.opMu must be held for reading. Otherwise d.opMu must be 44 // held for writing. 45 // - d.childrenMu must be locked. 46 // - d.isDir(). 47 // - child must be a newly-created dentry that has never had a parent. 48 // - d.children[name] must be unset or nil (a "negative child") 49 // 50 // +checklocksread:d.opMu 51 // +checklocks:d.childrenMu 52 func (d *dentry) cacheNewChildLocked(child *dentry, name string) { 53 d.IncRef() // reference held by child on its parent 54 child.parent.Store(d) 55 child.name = name 56 if d.children == nil { 57 d.children = make(map[string]*dentry) 58 } else if c, ok := d.children[name]; ok { 59 if c != nil { 60 panic(fmt.Sprintf("cacheNewChildLocked collision; child with name=%q already cached", name)) 61 } 62 63 // Cached child is negative. OK to cache over, but we must 64 // update the count of negative children. 65 d.negativeChildren-- 66 } 67 d.children[name] = child 68 } 69 70 // Preconditions: 71 // - d.childrenMu must be locked. 72 // - d.isDir(). 73 // - name is not already a negative entry. 74 // 75 // +checklocks:d.childrenMu 76 func (d *dentry) cacheNegativeLookupLocked(name string) { 77 // Don't cache negative lookups if InteropModeShared is in effect (since 78 // this makes remote lookup unavoidable), or if d.isSynthetic() (in which 79 // case the only files in the directory are those for which a dentry exists 80 // in d.children). Instead, just delete any previously-cached dentry. 81 if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { 82 delete(d.children, name) 83 return 84 } 85 if d.children == nil { 86 d.children = make(map[string]*dentry) 87 } 88 d.children[name] = nil 89 d.negativeChildren++ 90 91 if !d.negativeChildrenCache.isInited() { 92 // Initializing cache with all negative children name at the first time 93 // that negativeChildren increase upto max. 94 if d.negativeChildren >= maxCachedNegativeChildren { 95 d.negativeChildrenCache.init(maxCachedNegativeChildren) 96 for childName, child := range d.children { 97 if child == nil { 98 d.negativeChildrenCache.add(childName) 99 } 100 } 101 } 102 } else if victim := d.negativeChildrenCache.add(name); victim != "" { 103 // If victim is a negative entry in d.children, delete it. 104 if child, ok := d.children[victim]; ok && child == nil { 105 delete(d.children, victim) 106 d.negativeChildren-- 107 } 108 } 109 } 110 111 type createSyntheticOpts struct { 112 name string 113 mode linux.FileMode 114 kuid auth.KUID 115 kgid auth.KGID 116 117 // The endpoint for a synthetic socket. endpoint should be nil if the file 118 // being created is not a socket. 119 endpoint transport.BoundEndpoint 120 121 // pipe should be nil if the file being created is not a pipe. 122 pipe *pipe.VFSPipe 123 } 124 125 // newSyntheticDentry creates a synthetic file with the given name. 126 func (fs *filesystem) newSyntheticDentry(opts *createSyntheticOpts) *dentry { 127 now := fs.clock.Now().Nanoseconds() 128 child := &dentry{ 129 refs: atomicbitops.FromInt64(1), // held by parent. 130 fs: fs, 131 ino: fs.nextIno(), 132 mode: atomicbitops.FromUint32(uint32(opts.mode)), 133 uid: atomicbitops.FromUint32(uint32(opts.kuid)), 134 gid: atomicbitops.FromUint32(uint32(opts.kgid)), 135 blockSize: atomicbitops.FromUint32(hostarch.PageSize), // arbitrary 136 atime: atomicbitops.FromInt64(now), 137 mtime: atomicbitops.FromInt64(now), 138 ctime: atomicbitops.FromInt64(now), 139 btime: atomicbitops.FromInt64(now), 140 readFD: atomicbitops.FromInt32(-1), 141 writeFD: atomicbitops.FromInt32(-1), 142 mmapFD: atomicbitops.FromInt32(-1), 143 nlink: atomicbitops.FromUint32(2), 144 } 145 switch opts.mode.FileType() { 146 case linux.S_IFDIR: 147 // Nothing else needs to be done. 148 case linux.S_IFSOCK: 149 child.endpoint = opts.endpoint 150 case linux.S_IFIFO: 151 child.pipe = opts.pipe 152 default: 153 panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) 154 } 155 child.init(nil /* impl */) 156 return child 157 } 158 159 // Preconditions: 160 // - d.childrenMu must be locked. 161 // 162 // +checklocks:d.childrenMu 163 func (d *dentry) clearDirentsLocked() { 164 d.dirents = nil 165 d.childrenSet = nil 166 } 167 168 // +stateify savable 169 type directoryFD struct { 170 fileDescription 171 vfs.DirectoryFileDescriptionDefaultImpl 172 173 mu sync.Mutex `state:"nosave"` 174 off int64 175 dirents []vfs.Dirent 176 } 177 178 // Release implements vfs.FileDescriptionImpl.Release. 179 func (fd *directoryFD) Release(context.Context) { 180 } 181 182 // IterDirents implements vfs.FileDescriptionImpl.IterDirents. 183 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { 184 fd.mu.Lock() 185 defer fd.mu.Unlock() 186 187 d := fd.dentry() 188 if fd.dirents == nil { 189 ds, err := d.getDirents(ctx) 190 if err != nil { 191 return err 192 } 193 fd.dirents = ds 194 } 195 196 if d.cachedMetadataAuthoritative() { 197 d.touchAtime(fd.vfsfd.Mount()) 198 } 199 200 for fd.off < int64(len(fd.dirents)) { 201 if err := cb.Handle(fd.dirents[fd.off]); err != nil { 202 return err 203 } 204 fd.off++ 205 } 206 return nil 207 } 208 209 // Preconditions: 210 // - d.isDir(). 211 // - There exists at least one directoryFD representing d. 212 func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { 213 // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the 214 // presence of concurrent mutation of an iterated directory, so 215 // implementations may duplicate or omit entries in this case, which 216 // violates POSIX semantics. Thus we read all directory entries while 217 // holding d.opMu to exclude directory mutations. (Note that it is 218 // impossible for the client to exclude concurrent mutation from other 219 // remote filesystem users. Since there is no way to detect if the server 220 // has incorrectly omitted directory entries, we simply assume that the 221 // server is well-behaved under InteropModeShared.) This is inconsistent 222 // with Linux (which appears to assume that directory fids have the correct 223 // semantics, and translates struct file_operations::readdir calls directly 224 // to readdir RPCs), but is consistent with VFS1. 225 226 // filesystem.renameMu is needed for d.parent, and must be locked before 227 // d.opMu. 228 d.fs.renameMu.RLock() 229 defer d.fs.renameMu.RUnlock() 230 d.opMu.RLock() 231 defer d.opMu.RUnlock() 232 233 // d.childrenMu must be locked after d.opMu and held for the entire 234 // function. This synchronizes concurrent getDirents() attempts. 235 // getdents(2) advances the file offset. To get complete results from 236 // multiple getdents(2) calls, the directory FD's offset needs to be 237 // protected. 238 d.childrenMu.Lock() 239 defer d.childrenMu.Unlock() 240 241 if d.dirents != nil { 242 return d.dirents, nil 243 } 244 245 // It's not clear if 9P2000.L's readdir is expected to return "." and "..", 246 // so we generate them here. 247 parent := genericParentOrSelf(d) 248 dirents := []vfs.Dirent{ 249 { 250 Name: ".", 251 Type: linux.DT_DIR, 252 Ino: uint64(d.ino), 253 NextOff: 1, 254 }, 255 { 256 Name: "..", 257 Type: uint8(parent.mode.Load() >> 12), 258 Ino: uint64(parent.ino), 259 NextOff: 2, 260 }, 261 } 262 var realChildren map[string]struct{} 263 if !d.isSynthetic() { 264 if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { 265 // Record the set of children d actually has so that we don't emit 266 // duplicate entries for synthetic children. 267 realChildren = make(map[string]struct{}) 268 } 269 d.handleMu.RLock() 270 if !d.isReadHandleOk() { 271 // This should not be possible because a readable handle should 272 // have been opened when the calling directoryFD was opened. 273 panic("gofer.dentry.getDirents called without a readable handle") 274 } 275 err := d.getDirentsLocked(ctx, func(name string, key inoKey, dType uint8) { 276 dirent := vfs.Dirent{ 277 Name: name, 278 Ino: d.fs.inoFromKey(key), 279 NextOff: int64(len(dirents) + 1), 280 Type: dType, 281 } 282 dirents = append(dirents, dirent) 283 if realChildren != nil { 284 realChildren[name] = struct{}{} 285 } 286 }) 287 d.handleMu.RUnlock() 288 if err != nil { 289 return nil, err 290 } 291 } 292 293 // Emit entries for synthetic children. 294 if d.syntheticChildren != 0 { 295 for _, child := range d.children { 296 if child == nil || !child.isSynthetic() { 297 continue 298 } 299 if _, ok := realChildren[child.name]; ok { 300 continue 301 } 302 dirents = append(dirents, vfs.Dirent{ 303 Name: child.name, 304 Type: uint8(child.mode.Load() >> 12), 305 Ino: uint64(child.ino), 306 NextOff: int64(len(dirents) + 1), 307 }) 308 } 309 } 310 // Cache dirents for future directoryFDs if permitted. 311 if d.cachedMetadataAuthoritative() { 312 d.dirents = dirents 313 d.childrenSet = make(map[string]struct{}, len(dirents)) 314 for _, dirent := range d.dirents { 315 d.childrenSet[dirent.Name] = struct{}{} 316 } 317 } 318 return dirents, nil 319 } 320 321 // Seek implements vfs.FileDescriptionImpl.Seek. 322 func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 323 fd.mu.Lock() 324 defer fd.mu.Unlock() 325 326 switch whence { 327 case linux.SEEK_SET: 328 if offset < 0 { 329 return 0, linuxerr.EINVAL 330 } 331 if offset == 0 { 332 // Ensure that the next call to fd.IterDirents() calls 333 // fd.dentry().getDirents(). 334 fd.dirents = nil 335 } 336 fd.off = offset 337 return fd.off, nil 338 case linux.SEEK_CUR: 339 offset += fd.off 340 if offset < 0 { 341 return 0, linuxerr.EINVAL 342 } 343 // Don't clear fd.dirents in this case, even if offset == 0. 344 fd.off = offset 345 return fd.off, nil 346 default: 347 return 0, linuxerr.EINVAL 348 } 349 } 350 351 // Sync implements vfs.FileDescriptionImpl.Sync. 352 func (fd *directoryFD) Sync(ctx context.Context) error { 353 return fd.dentry().syncRemoteFile(ctx) 354 }