github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/erofs/erofs.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package erofs implements erofs. 16 package erofs 17 18 import ( 19 "os" 20 "runtime" 21 "strconv" 22 "sync" 23 "sync/atomic" 24 25 "github.com/metacubex/gvisor/pkg/abi/linux" 26 "github.com/metacubex/gvisor/pkg/cleanup" 27 "github.com/metacubex/gvisor/pkg/context" 28 "github.com/metacubex/gvisor/pkg/erofs" 29 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 30 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 31 "github.com/metacubex/gvisor/pkg/sentry/memmap" 32 "github.com/metacubex/gvisor/pkg/sentry/vfs" 33 ) 34 35 // Name is the filesystem name. It is part of the interface used by users, 36 // e.g. via annotations, and shouldn't change. 37 const Name = "erofs" 38 39 // Mount option names for EROFS. 40 const ( 41 moptImageFD = "ifd" 42 ) 43 44 // FilesystemType implements vfs.FilesystemType. 45 // 46 // +stateify savable 47 type FilesystemType struct{} 48 49 // filesystem implements vfs.FilesystemImpl. 50 // 51 // +stateify savable 52 type filesystem struct { 53 vfsfs vfs.Filesystem 54 55 // Immutable options. 56 mopts string 57 iopts InternalFilesystemOptions 58 59 // devMinor is the filesystem's minor device number. devMinor is immutable. 60 devMinor uint32 61 62 // root is the root dentry. root is immutable. 63 root *dentry 64 65 // image is the EROFS image. image is immutable. 66 image *erofs.Image 67 68 // mf implements memmap.File for this image. 69 mf imageMemmapFile 70 71 // inodeBuckets contains the inodes in use. Multiple buckets are used to 72 // reduce the lock contention. Bucket is chosen based on the hash calculation 73 // on nid in filesystem.inodeBucket. 74 inodeBuckets []inodeBucket 75 } 76 77 // InternalFilesystemOptions may be passed as 78 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. 79 // 80 // +stateify savable 81 type InternalFilesystemOptions struct { 82 // If UniqueID is non-empty, it is an opaque string used to reassociate the 83 // filesystem with a new image FD during restoration from checkpoint. 84 UniqueID vfs.RestoreID 85 } 86 87 // Name implements vfs.FilesystemType.Name. 88 func (FilesystemType) Name() string { 89 return Name 90 } 91 92 // Release implements vfs.FilesystemType.Release. 93 func (FilesystemType) Release(ctx context.Context) {} 94 95 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 96 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 97 mopts := vfs.GenericParseMountOptions(opts.Data) 98 99 var cu cleanup.Cleanup 100 defer cu.Clean() 101 102 fd, err := getFDFromMountOptionsMap(ctx, mopts) 103 if err != nil { 104 return nil, nil, err 105 } 106 107 f := os.NewFile(uintptr(fd), "EROFS image file") 108 image, err := erofs.OpenImage(f) 109 if err != nil { 110 f.Close() 111 return nil, nil, err 112 } 113 cu.Add(func() { image.Close() }) 114 115 iopts, ok := opts.InternalData.(InternalFilesystemOptions) 116 if opts.InternalData != nil && !ok { 117 ctx.Warningf("erofs.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted erofs.InternalFilesystemOptions", opts.InternalData) 118 return nil, nil, linuxerr.EINVAL 119 } 120 121 devMinor, err := vfsObj.GetAnonBlockDevMinor() 122 if err != nil { 123 return nil, nil, err 124 } 125 126 fs := &filesystem{ 127 mopts: opts.Data, 128 iopts: iopts, 129 image: image, 130 devMinor: devMinor, 131 mf: imageMemmapFile{image: image}, 132 } 133 fs.vfsfs.Init(vfsObj, &fstype, fs) 134 cu.Add(func() { fs.vfsfs.DecRef(ctx) }) 135 136 fs.inodeBuckets = make([]inodeBucket, runtime.GOMAXPROCS(0)) 137 for i := range fs.inodeBuckets { 138 fs.inodeBuckets[i].init() 139 } 140 141 root, err := fs.newDentry(image.RootNid()) 142 if err != nil { 143 return nil, nil, err 144 } 145 146 // Increase the root's reference count to 2. One reference is returned to 147 // the caller, and the other is held by fs. 148 root.IncRef() 149 fs.root = root 150 151 cu.Release() 152 return &fs.vfsfs, &root.vfsd, nil 153 } 154 155 func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { 156 ifdstr, ok := mopts[moptImageFD] 157 if !ok { 158 ctx.Warningf("erofs.getFDFromMountOptionsMap: image FD must be specified as '%s=<file descriptor>'", moptImageFD) 159 return -1, linuxerr.EINVAL 160 } 161 delete(mopts, moptImageFD) 162 163 ifd, err := strconv.Atoi(ifdstr) 164 if err != nil { 165 ctx.Warningf("erofs.getFDFromMountOptionsMap: invalid image FD: %s=%s", moptImageFD, ifdstr) 166 return -1, linuxerr.EINVAL 167 } 168 169 return ifd, nil 170 } 171 172 // Release implements vfs.FilesystemImpl.Release. 173 func (fs *filesystem) Release(ctx context.Context) { 174 // An extra reference was held by the filesystem on the root. 175 if fs.root != nil { 176 fs.root.DecRef(ctx) 177 } 178 fs.image.Close() 179 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 180 } 181 182 func (fs *filesystem) statFS() linux.Statfs { 183 blockSize := int64(fs.image.BlockSize()) 184 return linux.Statfs{ 185 Type: erofs.SuperBlockMagicV1, 186 NameLength: erofs.MaxNameLen, 187 BlockSize: blockSize, 188 FragmentSize: blockSize, 189 Blocks: uint64(fs.image.Blocks()), 190 } 191 } 192 193 // +stateify savable 194 type inodeBucket struct { 195 // mu protects inodeMap. 196 mu sync.RWMutex `state:"nosave"` 197 198 // inodeMap contains the inodes indexed by nid. 199 // +checklocks:mu 200 inodeMap map[uint64]*inode 201 } 202 203 func (ib *inodeBucket) init() { 204 ib.inodeMap = make(map[uint64]*inode) // +checklocksignore 205 } 206 207 // getInode returns the inode identified by nid. A reference on inode is also 208 // returned to caller. 209 func (ib *inodeBucket) getInode(nid uint64) *inode { 210 ib.mu.RLock() 211 defer ib.mu.RUnlock() 212 i := ib.inodeMap[nid] 213 if i != nil { 214 i.IncRef() 215 } 216 return i 217 } 218 219 // addInode adds the inode identified by nid into the bucket. It will first check 220 // whether the old inode exists. If not, it will call newInode() to get the new inode. 221 // The inode eventually saved in the bucket will be returned with a reference for caller. 222 func (ib *inodeBucket) addInode(nid uint64, newInode func() *inode) *inode { 223 ib.mu.Lock() 224 defer ib.mu.Unlock() 225 if i, ok := ib.inodeMap[nid]; ok { 226 i.IncRef() 227 return i 228 } 229 i := newInode() 230 ib.inodeMap[nid] = i 231 return i 232 } 233 234 // removeInode removes the inode identified by nid. 235 func (ib *inodeBucket) removeInode(nid uint64) { 236 ib.mu.Lock() 237 delete(ib.inodeMap, nid) 238 ib.mu.Unlock() 239 } 240 241 func (fs *filesystem) inodeBucket(nid uint64) *inodeBucket { 242 bucket := nid % uint64(len(fs.inodeBuckets)) 243 return &fs.inodeBuckets[bucket] 244 } 245 246 // inode represents a filesystem object. 247 // 248 // Each dentry holds a reference on the inode it represents. An inode will 249 // be dropped once its reference count reaches zero. We do not cache inodes 250 // directly. The caching policy is implemented on top of dentries. 251 // 252 // +stateify savable 253 type inode struct { 254 erofs.Inode 255 256 // inodeRefs is the reference count. 257 inodeRefs 258 259 // fs is the owning filesystem. 260 fs *filesystem 261 262 // dirMu protects dirents. dirents is immutable after creation. 263 dirMu sync.RWMutex `state:"nosave"` 264 // +checklocks:dirMu 265 dirents []vfs.Dirent `state:"nosave"` 266 267 // mapsMu protects mappings. 268 mapsMu sync.Mutex `state:"nosave"` 269 270 // mappings tracks the mappings of the file into memmap.MappingSpaces 271 // if this inode represents a regular file. 272 // +checklocks:mapsMu 273 mappings memmap.MappingSet 274 275 // locks supports POSIX and BSD style locks. 276 locks vfs.FileLocks 277 278 // Inotify watches for this inode. 279 watches vfs.Watches 280 } 281 282 // getInode returns the inode identified by nid. A reference on inode is also 283 // returned to caller. 284 func (fs *filesystem) getInode(nid uint64) (*inode, error) { 285 bucket := fs.inodeBucket(nid) 286 287 // Fast path, inode already exists. 288 if i := bucket.getInode(nid); i != nil { 289 return i, nil 290 } 291 292 // Slow path, create a new inode. 293 // 294 // Construct the underlying inode object from the image without taking 295 // the bucket lock first to reduce the contention. 296 ino, err := fs.image.Inode(nid) 297 if err != nil { 298 return nil, err 299 } 300 return bucket.addInode(nid, func() *inode { 301 i := &inode{ 302 Inode: ino, 303 fs: fs, 304 } 305 i.InitRefs() 306 return i 307 }), nil 308 309 } 310 311 // DecRef should be called when you're finished with an inode. 312 func (i *inode) DecRef(ctx context.Context) { 313 i.inodeRefs.DecRef(func() { 314 nid := i.Nid() 315 i.fs.inodeBucket(nid).removeInode(nid) 316 }) 317 } 318 319 func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 320 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(i.Mode()), auth.KUID(i.UID()), auth.KGID(i.GID())) 321 } 322 323 func (i *inode) statTo(stat *linux.Statx) { 324 stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | 325 linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | 326 linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | 327 linux.STATX_MTIME 328 stat.Blksize = i.fs.image.BlockSize() 329 stat.Nlink = i.Nlink() 330 stat.UID = i.UID() 331 stat.GID = i.GID() 332 stat.Mode = i.Mode() 333 stat.Ino = i.Nid() 334 stat.Size = i.Size() 335 stat.Blocks = (stat.Size + 511) / 512 336 stat.Mtime = linux.StatxTimestamp{ 337 Sec: int64(i.Mtime()), 338 Nsec: i.MtimeNsec(), 339 } 340 stat.Atime = stat.Mtime 341 stat.Ctime = stat.Mtime 342 stat.DevMajor = linux.UNNAMED_MAJOR 343 stat.DevMinor = i.fs.devMinor 344 } 345 346 func (i *inode) fileType() uint16 { 347 return i.Mode() & linux.S_IFMT 348 } 349 350 // dentry implements vfs.DentryImpl. 351 // 352 // The filesystem is read-only and currently we never drop the cached dentries 353 // until the filesystem is unmounted. The reference model works like this: 354 // 355 // - The initial reference count of each dentry is one, which is the reference 356 // held by the parent (so when the reference count is one, it also means that 357 // this is a cached dentry, i.e. not in use). 358 // 359 // - When a dentry is used (e.g. opened by someone), its reference count will 360 // be increased and the new reference is held by caller. 361 // 362 // - The reference count of root dentry is two. One reference is returned to 363 // the caller of `GetFilesystem()`, and the other is held by `fs`. 364 // 365 // TODO: This can lead to unbounded memory growth in sentry due to the ever-growing 366 // dentry tree. We should have a dentry LRU cache, similar to what fsimpl/gofer does. 367 // 368 // +stateify savable 369 type dentry struct { 370 vfsd vfs.Dentry 371 372 // dentryRefs is the reference count. 373 dentryRefs 374 375 // parent is this dentry's parent directory. If this dentry is 376 // a file system root, parent is nil. 377 parent atomic.Pointer[dentry] `state:".(*dentry)"` 378 379 // name is this dentry's name in its parent. If this dentry is 380 // a file system root, name is the empty string. 381 name string 382 383 // inode is the inode represented by this dentry. 384 inode *inode 385 386 // dirMu serializes changes to the dentry tree. 387 dirMu sync.RWMutex `state:"nosave"` 388 389 // childMap contains the mappings of child names to dentries if this 390 // dentry represents a directory. 391 // +checklocks:dirMu 392 childMap map[string]*dentry 393 } 394 395 // The caller is expected to handle dentry insertion into dentry tree. 396 func (fs *filesystem) newDentry(nid uint64) (*dentry, error) { 397 i, err := fs.getInode(nid) 398 if err != nil { 399 return nil, err 400 } 401 d := &dentry{ 402 inode: i, 403 } 404 d.InitRefs() 405 d.vfsd.Init(d) 406 return d, nil 407 } 408 409 // DecRef implements vfs.DentryImpl.DecRef. 410 func (d *dentry) DecRef(ctx context.Context) { 411 d.dentryRefs.DecRef(func() { 412 d.dirMu.Lock() 413 for _, c := range d.childMap { 414 c.DecRef(ctx) 415 } 416 d.childMap = nil 417 d.dirMu.Unlock() 418 d.inode.DecRef(ctx) 419 }) 420 } 421 422 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 423 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 424 if d.inode.IsDir() { 425 events |= linux.IN_ISDIR 426 } 427 // The ordering below is important, Linux always notifies the parent first. 428 if parent := d.parent.Load(); parent != nil { 429 parent.inode.watches.Notify(ctx, d.name, events, cookie, et, false) 430 } 431 d.inode.watches.Notify(ctx, "", events, cookie, et, false) 432 } 433 434 // Watches implements vfs.DentryImpl.Watches. 435 func (d *dentry) Watches() *vfs.Watches { 436 return &d.inode.watches 437 } 438 439 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 440 func (d *dentry) OnZeroWatches(ctx context.Context) {} 441 442 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { 443 ats := vfs.AccessTypesForOpenFlags(opts) 444 if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { 445 return nil, err 446 } 447 448 switch d.inode.fileType() { 449 case linux.S_IFREG: 450 if ats&vfs.MayWrite != 0 { 451 return nil, linuxerr.EROFS 452 } 453 var fd regularFileFD 454 fd.LockFD.Init(&d.inode.locks) 455 if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { 456 return nil, err 457 } 458 return &fd.vfsfd, nil 459 460 case linux.S_IFDIR: 461 // Can't open directories with O_CREAT. 462 if opts.Flags&linux.O_CREAT != 0 { 463 return nil, linuxerr.EISDIR 464 } 465 // Can't open directories writably. 466 if ats&vfs.MayWrite != 0 { 467 return nil, linuxerr.EISDIR 468 } 469 if opts.Flags&linux.O_DIRECT != 0 { 470 return nil, linuxerr.EINVAL 471 } 472 var fd directoryFD 473 fd.LockFD.Init(&d.inode.locks) 474 if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { 475 return nil, err 476 } 477 return &fd.vfsfd, nil 478 479 case linux.S_IFLNK: 480 // Can't open symlinks without O_PATH, which is handled at the VFS layer. 481 return nil, linuxerr.ELOOP 482 483 default: 484 return nil, linuxerr.ENXIO 485 } 486 } 487 488 // +stateify savable 489 type fileDescription struct { 490 vfsfd vfs.FileDescription 491 vfs.FileDescriptionDefaultImpl 492 vfs.LockFD 493 494 lockLogging sync.Once `state:"nosave"` 495 } 496 497 func (fd *fileDescription) filesystem() *filesystem { 498 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 499 } 500 501 func (fd *fileDescription) dentry() *dentry { 502 return fd.vfsfd.Dentry().Impl().(*dentry) 503 } 504 505 func (fd *fileDescription) inode() *inode { 506 return fd.dentry().inode 507 } 508 509 // Stat implements vfs.FileDescriptionImpl.Stat. 510 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 511 var stat linux.Statx 512 fd.inode().statTo(&stat) 513 return stat, nil 514 } 515 516 // SetStat implements vfs.FileDescriptionImpl.SetStat. 517 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 518 return linuxerr.EROFS 519 } 520 521 // StatFS implements vfs.FileDescriptionImpl.StatFS. 522 func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { 523 return fd.filesystem().statFS(), nil 524 } 525 526 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 527 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 528 return nil, linuxerr.ENOTSUP 529 } 530 531 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 532 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 533 return "", linuxerr.ENOTSUP 534 } 535 536 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 537 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 538 return linuxerr.EROFS 539 } 540 541 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 542 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 543 return linuxerr.EROFS 544 } 545 546 // Sync implements vfs.FileDescriptionImpl.Sync. 547 func (*fileDescription) Sync(context.Context) error { 548 return nil 549 } 550 551 // Release implements vfs.FileDescriptionImpl.Release. 552 func (*fileDescription) Release(ctx context.Context) {}