github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/erofs/erofs.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package erofs provides the ability to access the contents in an EROFS [1] image. 16 // 17 // The design principle of this package is that, it will just provide the ability 18 // to access the contents in the image, and it will never cache any objects internally. 19 // The whole disk image is mapped via a read-only/shared mapping, and it relies on 20 // host kernel to cache the blocks/pages transparently. 21 // 22 // [1] https://docs.kernel.org/filesystems/erofs.html 23 package erofs 24 25 import ( 26 "bytes" 27 "fmt" 28 "hash/crc32" 29 "os" 30 31 "golang.org/x/sys/unix" 32 "github.com/sagernet/gvisor/pkg/abi/linux" 33 "github.com/sagernet/gvisor/pkg/cleanup" 34 "github.com/sagernet/gvisor/pkg/errors/linuxerr" 35 "github.com/sagernet/gvisor/pkg/gohacks" 36 "github.com/sagernet/gvisor/pkg/hostarch" 37 "github.com/sagernet/gvisor/pkg/log" 38 "github.com/sagernet/gvisor/pkg/marshal" 39 "github.com/sagernet/gvisor/pkg/safemem" 40 ) 41 42 const ( 43 // Definitions for superblock. 44 SuperBlockMagicV1 = 0xe0f5e1e2 45 SuperBlockOffset = 1024 46 47 // Inode slot size in bit shift. 48 InodeSlotBits = 5 49 50 // Max file name length. 51 MaxNameLen = 255 52 ) 53 54 // Bit definitions for Inode*::Format. 
const (
	// InodeLayoutBit and InodeLayoutBits are the position and the width
	// (in bits) of the inode layout field.
	InodeLayoutBit  = 0
	InodeLayoutBits = 1

	// InodeDataLayoutBit and InodeDataLayoutBits are the position and the
	// width (in bits) of the inode data layout field.
	InodeDataLayoutBit  = 1
	InodeDataLayoutBits = 3
)

// Inode layouts.
const (
	InodeLayoutCompact  = iota // 0
	InodeLayoutExtended        // 1
)

// Inode data layouts.
const (
	InodeDataLayoutFlatPlain             = 0
	InodeDataLayoutFlatCompressionLegacy = 1
	InodeDataLayoutFlatInline            = 2
	InodeDataLayoutFlatCompression       = 3
	InodeDataLayoutChunkBased            = 4
	InodeDataLayoutMax                   = 5
)

// Features w/ backward compatibility.
// This is not exhaustive, unused features are not listed.
const (
	FeatureCompatSuperBlockChecksum = 1 << 0
)

// Features w/o backward compatibility.
//
// Any features that aren't in FeatureIncompatSupported are incompatible
// with this implementation.
//
// This is not exhaustive, unused features are not listed.
const (
	FeatureIncompatSupported = 0
)

// Sizes of on-disk structures in bytes.
const (
	SuperBlockSize    = 128
	InodeCompactSize  = 32
	InodeExtendedSize = 64
	DirentSize        = 12
)

// SuperBlock represents on-disk superblock.
//
// The field order and widths mirror the on-disk layout and must not be
// rearranged.
//
// +marshal
// +stateify savable
type SuperBlock struct {
	Magic           uint32
	Checksum        uint32
	FeatureCompat   uint32
	BlockSizeBits   uint8
	ExtSlots        uint8
	RootNid         uint16
	Inodes          uint64
	BuildTime       uint64
	BuildTimeNsec   uint32
	Blocks          uint32
	MetaBlockAddr   uint32
	XattrBlockAddr  uint32
	UUID            [16]uint8
	VolumeName      [16]uint8
	FeatureIncompat uint32
	Union1          uint16
	ExtraDevices    uint16
	DevTableSlotOff uint16
	Reserved        [38]uint8
}

// BlockSize returns the block size in bytes, i.e. 2^BlockSizeBits.
func (sb *SuperBlock) BlockSize() uint32 {
	return uint32(1) << sb.BlockSizeBits
}

// BlockAddrToOffset converts block addr to the offset in image file.
135 func (sb *SuperBlock) BlockAddrToOffset(addr uint32) uint64 { 136 return uint64(addr) << sb.BlockSizeBits 137 } 138 139 // MetaOffset returns the offset of metadata area in image file. 140 func (sb *SuperBlock) MetaOffset() uint64 { 141 return sb.BlockAddrToOffset(sb.MetaBlockAddr) 142 } 143 144 // NidToOffset converts inode number to the offset in image file. 145 func (sb *SuperBlock) NidToOffset(nid uint64) uint64 { 146 return sb.MetaOffset() + (nid << InodeSlotBits) 147 } 148 149 // InodeCompact represents 32-byte reduced form of on-disk inode. 150 // 151 // +marshal 152 type InodeCompact struct { 153 Format uint16 154 XattrCount uint16 155 Mode uint16 156 Nlink uint16 157 Size uint32 158 Reserved uint32 159 RawBlockAddr uint32 160 Ino uint32 161 UID uint16 162 GID uint16 163 Reserved2 uint32 164 } 165 166 // InodeExtended represents 64-byte complete form of on-disk inode. 167 // 168 // +marshal 169 type InodeExtended struct { 170 Format uint16 171 XattrCount uint16 172 Mode uint16 173 Reserved uint16 174 Size uint64 175 RawBlockAddr uint32 176 Ino uint32 177 UID uint32 178 GID uint32 179 Mtime uint64 180 MtimeNsec uint32 181 Nlink uint32 182 Reserved2 [16]uint8 183 } 184 185 // Dirent represents on-disk directory entry. 186 // 187 // +marshal 188 type Dirent struct { 189 NidLow uint32 190 NidHigh uint32 191 NameOff uint16 192 FileType uint8 193 Reserved uint8 194 } 195 196 // Nid returns the inode number of the inode referenced by this dirent. 197 func (d *Dirent) Nid() uint64 { 198 // EROFS on-disk structures are always in little endian. 199 // TODO: This implementation does not support big endian yet. 200 return (uint64(d.NidHigh) << 32) | uint64(d.NidLow) 201 } 202 203 // Image represents an open EROFS image. 204 // 205 // +stateify savable 206 type Image struct { 207 src *os.File `state:"nosave"` 208 bytes []byte `state:"nosave"` 209 sb SuperBlock 210 } 211 212 // OpenImage returns an Image providing access to the contents in the image file src. 
213 // 214 // On success, the ownership of src is transferred to Image. 215 func OpenImage(src *os.File) (*Image, error) { 216 i := &Image{src: src} 217 218 var cu cleanup.Cleanup 219 defer cu.Clean() 220 221 stat, err := i.src.Stat() 222 if err != nil { 223 return nil, err 224 } 225 i.bytes, err = unix.Mmap(int(i.src.Fd()), 0, int(stat.Size()), unix.PROT_READ, unix.MAP_SHARED) 226 if err != nil { 227 return nil, err 228 } 229 cu.Add(func() { unix.Munmap(i.bytes) }) 230 231 if err := i.initSuperBlock(); err != nil { 232 return nil, err 233 } 234 cu.Release() 235 return i, nil 236 } 237 238 // Close closes the image. 239 func (i *Image) Close() { 240 unix.Munmap(i.bytes) 241 i.src.Close() 242 } 243 244 // SuperBlock returns a copy of the image's superblock. 245 func (i *Image) SuperBlock() SuperBlock { 246 return i.sb 247 } 248 249 // BlockSize returns the block size of this image. 250 func (i *Image) BlockSize() uint32 { 251 return i.sb.BlockSize() 252 } 253 254 // Blocks returns the total blocks of this image. 255 func (i *Image) Blocks() uint32 { 256 return i.sb.Blocks 257 } 258 259 // RootNid returns the root inode number of this image. 260 func (i *Image) RootNid() uint64 { 261 return uint64(i.sb.RootNid) 262 } 263 264 // initSuperBlock initializes the superblock of this image. 265 func (i *Image) initSuperBlock() error { 266 // i.sb is used in the hot path. Let's save a copy of the superblock. 
267 if err := i.unmarshalAt(&i.sb, SuperBlockOffset); err != nil { 268 return fmt.Errorf("image size is too small") 269 } 270 271 if i.sb.Magic != SuperBlockMagicV1 { 272 return fmt.Errorf("unknown magic: 0x%x", i.sb.Magic) 273 } 274 275 if err := i.verifyChecksum(); err != nil { 276 return err 277 } 278 279 if featureIncompat := i.sb.FeatureIncompat & ^uint32(FeatureIncompatSupported); featureIncompat != 0 { 280 return fmt.Errorf("unsupported incompatible features detected: 0x%x", featureIncompat) 281 } 282 283 if i.BlockSize()%hostarch.PageSize != 0 { 284 return fmt.Errorf("unsupported block size: 0x%x", i.BlockSize()) 285 } 286 287 return nil 288 } 289 290 // verifyChecksum verifies the checksum of the superblock. 291 func (i *Image) verifyChecksum() error { 292 if i.sb.FeatureCompat&FeatureCompatSuperBlockChecksum == 0 { 293 return nil 294 } 295 296 sb := i.sb 297 sb.Checksum = 0 298 table := crc32.MakeTable(crc32.Castagnoli) 299 checksum := crc32.Checksum(marshal.Marshal(&sb), table) 300 301 off := SuperBlockOffset + uint64(i.sb.SizeBytes()) 302 if bytes, err := i.BytesAt(off, uint64(i.BlockSize())-off); err != nil { 303 return fmt.Errorf("image size is too small") 304 } else { 305 checksum = ^crc32.Update(checksum, table, bytes) 306 } 307 if checksum != i.sb.Checksum { 308 return fmt.Errorf("invalid checksum: 0x%x, expected: 0x%x", checksum, i.sb.Checksum) 309 } 310 311 return nil 312 } 313 314 // FD returns the host FD of underlying image file. 315 func (i *Image) FD() int { 316 return int(i.src.Fd()) 317 } 318 319 // checkRange checks whether the range [off, off+n) is valid. 320 func (i *Image) checkRange(off, n uint64) bool { 321 size := uint64(len(i.bytes)) 322 end := off + n 323 return off < size && off <= end && end <= size 324 } 325 326 // BytesAt returns the bytes at [off, off+n) of the image. 
327 func (i *Image) BytesAt(off, n uint64) ([]byte, error) { 328 if ok := i.checkRange(off, n); !ok { 329 log.Warningf("Invalid byte range (off: 0x%x, n: 0x%x) for image (size: 0x%x)", off, n, len(i.bytes)) 330 return nil, linuxerr.EFAULT 331 } 332 return i.bytes[off : off+n], nil 333 } 334 335 // checkInodeAlignment checks whether off matches inode's alignment requirement. 336 func checkInodeAlignment(off uint64) bool { 337 // Each valid inode should be aligned with an inode slot, which is 338 // a fixed value (32 bytes). 339 return off&((1<<InodeSlotBits)-1) == 0 340 } 341 342 // inodeFormatAt returns the format of the inode at offset off within the 343 // memory backed by image. 344 func (i *Image) inodeFormatAt(off uint64) (uint16, error) { 345 if ok := checkInodeAlignment(off); !ok { 346 return 0, linuxerr.EFAULT 347 } 348 if ok := i.checkRange(off, 2); !ok { 349 return 0, linuxerr.EFAULT 350 } 351 return *(*uint16)(i.pointerAt(off)), nil 352 } 353 354 // inodeCompactAt returns a pointer to the compact inode at offset off within 355 // the memory backed by image. 356 func (i *Image) inodeCompactAt(off uint64) (*InodeCompact, error) { 357 if ok := checkInodeAlignment(off); !ok { 358 return nil, linuxerr.EFAULT 359 } 360 if ok := i.checkRange(off, InodeCompactSize); !ok { 361 return nil, linuxerr.EFAULT 362 } 363 return (*InodeCompact)(i.pointerAt(off)), nil 364 } 365 366 // inodeExtendedAt returns a pointer to the extended inode at offset off within 367 // the memory backed by image. 368 func (i *Image) inodeExtendedAt(off uint64) (*InodeExtended, error) { 369 if ok := checkInodeAlignment(off); !ok { 370 return nil, linuxerr.EFAULT 371 } 372 if ok := i.checkRange(off, InodeExtendedSize); !ok { 373 return nil, linuxerr.EFAULT 374 } 375 return (*InodeExtended)(i.pointerAt(off)), nil 376 } 377 378 // direntAt returns a pointer to the dirent at offset off within the memory 379 // backed by image. 
380 func (i *Image) direntAt(off uint64) (*Dirent, error) { 381 // Each valid dirent should be aligned to 4 bytes. 382 if off&3 != 0 { 383 return nil, linuxerr.EFAULT 384 } 385 if ok := i.checkRange(off, DirentSize); !ok { 386 return nil, linuxerr.EFAULT 387 } 388 return (*Dirent)(i.pointerAt(off)), nil 389 } 390 391 // unmarshalAt deserializes data from the bytes at [off, off+n) of the image. 392 func (i *Image) unmarshalAt(data marshal.Marshallable, off uint64) error { 393 bytes, err := i.BytesAt(off, uint64(data.SizeBytes())) 394 if err != nil { 395 log.Warningf("Failed to deserialize %T from 0x%x.", data, off) 396 return err 397 } 398 data.UnmarshalUnsafe(bytes) 399 return nil 400 } 401 402 // Inode returns the inode identified by nid. 403 func (i *Image) Inode(nid uint64) (Inode, error) { 404 inode := Inode{ 405 image: i, 406 nid: nid, 407 } 408 409 off := i.sb.NidToOffset(nid) 410 if format, err := i.inodeFormatAt(off); err != nil { 411 return Inode{}, err 412 } else { 413 inode.format = format 414 } 415 416 var ( 417 rawBlockAddr uint32 418 inodeSize int 419 ) 420 421 switch layout := inode.Layout(); layout { 422 case InodeLayoutCompact: 423 ino, err := i.inodeCompactAt(off) 424 if err != nil { 425 return Inode{}, err 426 } 427 428 if ino.XattrCount != 0 { 429 log.Warningf("Unsupported xattr at inode (nid=%v)", nid) 430 return Inode{}, linuxerr.ENOTSUP 431 } 432 433 rawBlockAddr = ino.RawBlockAddr 434 inodeSize = ino.SizeBytes() 435 436 inode.size = uint64(ino.Size) 437 inode.nlink = uint32(ino.Nlink) 438 inode.mode = ino.Mode 439 inode.uid = uint32(ino.UID) 440 inode.gid = uint32(ino.GID) 441 inode.mtime = i.sb.BuildTime 442 inode.mtimeNsec = i.sb.BuildTimeNsec 443 444 case InodeLayoutExtended: 445 ino, err := i.inodeExtendedAt(off) 446 if err != nil { 447 return Inode{}, err 448 } 449 450 if ino.XattrCount != 0 { 451 log.Warningf("Unsupported xattr at inode (nid=%v)", nid) 452 return Inode{}, linuxerr.ENOTSUP 453 } 454 455 rawBlockAddr = ino.RawBlockAddr 
456 inodeSize = ino.SizeBytes() 457 458 inode.size = ino.Size 459 inode.nlink = ino.Nlink 460 inode.mode = ino.Mode 461 inode.uid = ino.UID 462 inode.gid = ino.GID 463 inode.mtime = ino.Mtime 464 inode.mtimeNsec = ino.MtimeNsec 465 466 default: 467 log.Warningf("Unsupported layout 0x%x at inode (nid=%v)", layout, nid) 468 return Inode{}, linuxerr.ENOTSUP 469 } 470 471 blockSize := uint64(i.BlockSize()) 472 inode.blocks = (inode.size + (blockSize - 1)) / blockSize 473 474 switch dataLayout := inode.DataLayout(); dataLayout { 475 case InodeDataLayoutFlatInline: 476 // Check that whether the file data in the last block fits into 477 // the remaining room of the metadata block. 478 tailSize := inode.size & (blockSize - 1) 479 if tailSize == 0 || tailSize > blockSize-uint64(inodeSize) { 480 log.Warningf("Inline data not found or cross block boundary at inode (nid=%v)", nid) 481 return Inode{}, linuxerr.EUCLEAN 482 } 483 inode.idataOff = off + uint64(inodeSize) 484 fallthrough 485 486 case InodeDataLayoutFlatPlain: 487 inode.dataOff = i.sb.BlockAddrToOffset(rawBlockAddr) 488 489 default: 490 log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, nid) 491 return Inode{}, linuxerr.ENOTSUP 492 } 493 494 return inode, nil 495 } 496 497 // Inode represents in-memory inode object. 498 // 499 // +stateify savable 500 type Inode struct { 501 // image is the underlying image. Inode should not perform writable 502 // operations (e.g. Close()) on the image. 503 image *Image 504 505 // dataOff points to the data of this inode in the data blocks. 506 dataOff uint64 507 508 // idataOff points to the tail packing inline data of this inode 509 // if it's not zero in the metadata block. 510 idataOff uint64 511 512 // blocks indicates the count of blocks that store the data associated 513 // with this inode. It will count in the metadata block that includes 514 // the inline data as well. 515 blocks uint64 516 517 // format is the format of this inode. 
518 format uint16 519 520 // Metadata. 521 mode uint16 522 nid uint64 523 size uint64 524 mtime uint64 525 mtimeNsec uint32 526 uid uint32 527 gid uint32 528 nlink uint32 529 } 530 531 // bitRange returns the bits within the range [bit, bit+bits) in value. 532 func bitRange(value, bit, bits uint16) uint16 { 533 return (value >> bit) & ((1 << bits) - 1) 534 } 535 536 // Layout returns the inode layout. 537 func (i *Inode) Layout() uint16 { 538 return bitRange(i.format, InodeLayoutBit, InodeLayoutBits) 539 } 540 541 // DataLayout returns the inode data layout. 542 func (i *Inode) DataLayout() uint16 { 543 return bitRange(i.format, InodeDataLayoutBit, InodeDataLayoutBits) 544 } 545 546 // IsRegular indicates whether i represents a regular file. 547 func (i *Inode) IsRegular() bool { 548 return i.mode&linux.S_IFMT == linux.S_IFREG 549 } 550 551 // IsDir indicates whether i represents a directory. 552 func (i *Inode) IsDir() bool { 553 return i.mode&linux.S_IFMT == linux.S_IFDIR 554 } 555 556 // IsCharDev indicates whether i represents a character device. 557 func (i *Inode) IsCharDev() bool { 558 return i.mode&linux.S_IFMT == linux.S_IFCHR 559 } 560 561 // IsBlockDev indicates whether i represents a block device. 562 func (i *Inode) IsBlockDev() bool { 563 return i.mode&linux.S_IFMT == linux.S_IFBLK 564 } 565 566 // IsFIFO indicates whether i represents a named pipe. 567 func (i *Inode) IsFIFO() bool { 568 return i.mode&linux.S_IFMT == linux.S_IFIFO 569 } 570 571 // IsSocket indicates whether i represents a socket. 572 func (i *Inode) IsSocket() bool { 573 return i.mode&linux.S_IFMT == linux.S_IFSOCK 574 } 575 576 // IsSymlink indicates whether i represents a symbolic link. 577 func (i *Inode) IsSymlink() bool { 578 return i.mode&linux.S_IFMT == linux.S_IFLNK 579 } 580 581 // Nid returns the inode number. 582 func (i *Inode) Nid() uint64 { 583 return i.nid 584 } 585 586 // Size returns the data size. 
587 func (i *Inode) Size() uint64 { 588 return i.size 589 } 590 591 // Nlink returns the number of hard links. 592 func (i *Inode) Nlink() uint32 { 593 return i.nlink 594 } 595 596 // Mtime returns the time of last modification. 597 func (i *Inode) Mtime() uint64 { 598 return i.mtime 599 } 600 601 // MtimeNsec returns the nano second part of Mtime. 602 func (i *Inode) MtimeNsec() uint32 { 603 return i.mtimeNsec 604 } 605 606 // Mode returns the file type and permissions. 607 func (i *Inode) Mode() uint16 { 608 return i.mode 609 } 610 611 // UID returns the user ID of the owner. 612 func (i *Inode) UID() uint32 { 613 return i.uid 614 } 615 616 // GID returns the group ID of the owner. 617 func (i *Inode) GID() uint32 { 618 return i.gid 619 } 620 621 // DataOffset returns the data offset of this inode in image file. 622 func (i *Inode) DataOffset() (uint64, error) { 623 // TODO: We don't support regular files with inline data yet, which means the image 624 // should be created with the "-E noinline_data" option. The "-E noinline_data" option 625 // was introduced for the DAX feature support in Linux [1]. 626 // [1] https://github.com/erofs/erofs-utils/commit/60549d52c3b636f0ddd1d51b0c1517c1dee22595 627 if dataLayout := i.DataLayout(); dataLayout != InodeDataLayoutFlatPlain { 628 log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, i.Nid()) 629 return 0, linuxerr.ENOTSUP 630 } 631 return i.dataOff, nil 632 } 633 634 // Data returns the read-only file data of this inode. 
635 func (i *Inode) Data() (safemem.BlockSeq, error) { 636 switch dataLayout := i.DataLayout(); dataLayout { 637 case InodeDataLayoutFlatPlain: 638 bytes, err := i.image.BytesAt(i.dataOff, i.size) 639 if err != nil { 640 return safemem.BlockSeq{}, err 641 } 642 return safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bytes)), nil 643 644 case InodeDataLayoutFlatInline: 645 sl := make([]safemem.Block, 0, 2) 646 idataSize := i.size & (uint64(i.image.BlockSize()) - 1) 647 if i.size > idataSize { 648 if bytes, err := i.image.BytesAt(i.dataOff, i.size-idataSize); err != nil { 649 return safemem.BlockSeq{}, err 650 } else { 651 sl = append(sl, safemem.BlockFromSafeSlice(bytes)) 652 } 653 } 654 if bytes, err := i.image.BytesAt(i.idataOff, idataSize); err != nil { 655 return safemem.BlockSeq{}, err 656 } else { 657 sl = append(sl, safemem.BlockFromSafeSlice(bytes)) 658 } 659 return safemem.BlockSeqFromSlice(sl), nil 660 661 default: 662 log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, i.Nid()) 663 return safemem.BlockSeq{}, linuxerr.ENOTSUP 664 } 665 } 666 667 // blockData represents the information of the data in a block. 668 type blockData struct { 669 // base indicates the data offset within the image. 670 base uint64 671 // size indicates the data size. 672 size uint32 673 } 674 675 // valid indicates whether this is valid information about the data in a block. 676 func (b *blockData) valid() bool { 677 // The data offset within the image will never be zero. 678 return b.base > 0 679 } 680 681 // getBlockDataInfo returns the information of the data in the block identified by 682 // blockIdx of this inode. 683 // 684 // Precondition: blockIdx < i.blocks. 
685 func (i *Inode) getBlockDataInfo(blockIdx uint64) blockData { 686 blockSize := i.image.BlockSize() 687 lastBlock := blockIdx == i.blocks-1 688 base := i.idataOff 689 if !lastBlock || base == 0 { 690 base = i.dataOff + blockIdx*uint64(blockSize) 691 } 692 size := blockSize 693 if lastBlock { 694 if tailSize := uint32(i.size) & (blockSize - 1); tailSize != 0 { 695 size = tailSize 696 } 697 } 698 return blockData{base, size} 699 } 700 701 // getDirentName returns the name of dirent d in the given block of this inode. 702 // 703 // The on-disk format of one block looks like this: 704 // 705 // ___________________________ 706 // / | 707 // / ______________|________________ 708 // / / | nameoff1 | nameoffN-1 709 // ____________.______________._______________v________________v__________ 710 // | dirent | dirent | ... | dirent | filename | filename | ... | filename | 711 // |___.0___|____1___|_____|___N-1__|____0_____|____1_____|_____|___N-1____| 712 // \ ^ 713 // \ | * could have 714 // \ | trailing '\0' 715 // \________________________| nameoff0 716 // Directory block 717 // 718 // The on-disk format of one directory looks like this: 719 // 720 // [ (block 1) dirent 1 | dirent 2 | dirent 3 | name 1 | name 2 | name 3 | optional padding ] 721 // [ (block 2) dirent 4 | dirent 5 | name 4 | name 5 | optional padding ] 722 // ... 
723 // [ (block N) dirent M | dirent M+1 | name M | name M+1 | optional padding ] 724 // 725 // [ (metadata block) inode | optional fields | dirent M+2 | dirent M+3 | name M+2 | name M+3 | optional padding ] 726 // 727 // Refer: https://docs.kernel.org/filesystems/erofs.html#directories 728 func (i *Inode) getDirentName(d *Dirent, block blockData, lastDirent bool) ([]byte, error) { 729 var nameLen uint32 730 if lastDirent { 731 nameLen = block.size - uint32(d.NameOff) 732 } else { 733 nameLen = uint32(direntAfter(d).NameOff - d.NameOff) 734 } 735 if uint32(d.NameOff)+nameLen > block.size || nameLen > MaxNameLen || nameLen == 0 { 736 log.Warningf("Corrupted dirent at inode (nid=%v)", i.Nid()) 737 return nil, linuxerr.EUCLEAN 738 } 739 name, err := i.image.BytesAt(block.base+uint64(d.NameOff), uint64(nameLen)) 740 if err != nil { 741 return nil, err 742 } 743 if lastDirent { 744 // Optional padding may exist at the end of a block. 745 n := bytes.IndexByte(name, 0) 746 if n == 0 { 747 log.Warningf("Corrupted dirent at inode (nid=%v)", i.Nid()) 748 return nil, linuxerr.EUCLEAN 749 } 750 if n != -1 { 751 name = name[:n] 752 } 753 } 754 return name, nil 755 } 756 757 // getDirent0 returns a pointer to the first dirent in the given block of this inode. 758 func (i *Inode) getDirent0(block blockData) (*Dirent, error) { 759 d0, err := i.image.direntAt(block.base) 760 if err != nil { 761 return nil, err 762 } 763 if d0.NameOff < DirentSize || uint32(d0.NameOff) >= block.size { 764 log.Warningf("Invalid nameOff0 %v at inode (nid=%v)", d0.NameOff, i.Nid()) 765 return nil, linuxerr.EUCLEAN 766 } 767 return d0, nil 768 } 769 770 // Lookup looks up a child by the name. The child inode number will be returned on success. 
771 func (i *Inode) Lookup(name string) (uint64, error) { 772 if !i.IsDir() { 773 return 0, linuxerr.ENOTDIR 774 } 775 776 // Currently (Go 1.21), there is no safe and efficient way to do three-way 777 // string comparisons, so let's convert the string to a byte slice first. 778 nameBytes := gohacks.ImmutableBytesFromString(name) 779 780 // In EROFS, all directory entries are _strictly_ recorded in alphabetical 781 // order. The lookup is done by directly performing binary search on the 782 // disk data similar to what Linux does in fs/erofs/namei.c:erofs_namei(). 783 var ( 784 targetBlock blockData 785 targetNumDirents uint16 786 ) 787 788 // Find the block that may contain the target dirent first. 789 bLeft, bRight := int64(0), int64(i.blocks)-1 790 for bLeft <= bRight { 791 // Cast to uint64 to avoid overflow. 792 mid := uint64(bLeft+bRight) >> 1 793 block := i.getBlockDataInfo(mid) 794 d0, err := i.getDirent0(block) 795 if err != nil { 796 return 0, err 797 } 798 numDirents := d0.NameOff / DirentSize 799 d0Name, err := i.getDirentName(d0, block, numDirents == 1) 800 if err != nil { 801 return 0, err 802 } 803 switch bytes.Compare(nameBytes, d0Name) { 804 case 0: 805 // Found the target dirent. 806 return d0.Nid(), nil 807 case 1: 808 // name > d0Name, this block may contain the target dirent. 809 targetBlock = block 810 targetNumDirents = numDirents 811 bLeft = int64(mid) + 1 812 case -1: 813 // name < d0Name, this is not the block we're looking for. 814 bRight = int64(mid) - 1 815 } 816 } 817 818 if !targetBlock.valid() { 819 // The target block was not found. 820 return 0, linuxerr.ENOENT 821 } 822 823 // Find the target dirent in the target block. Note that, as the 0th dirent 824 // has already been checked during the block binary search, we don't need to 825 // check it again and can define dLeft/dRight as unsigned types. 
826 dLeft, dRight := uint16(1), targetNumDirents-1 827 for dLeft <= dRight { 828 // The sum will never lead to a uint16 overflow, as the maximum value of 829 // the operands is MaxUint16/DirentSize. 830 mid := (dLeft + dRight) >> 1 831 direntOff := targetBlock.base + uint64(mid)*DirentSize 832 d, err := i.image.direntAt(direntOff) 833 if err != nil { 834 return 0, err 835 } 836 dName, err := i.getDirentName(d, targetBlock, mid == targetNumDirents-1) 837 if err != nil { 838 return 0, err 839 } 840 switch bytes.Compare(nameBytes, dName) { 841 case 0: 842 // Found the target dirent. 843 return d.Nid(), nil 844 case 1: 845 // name > dName. 846 dLeft = mid + 1 847 case -1: 848 // name < dName. 849 dRight = mid - 1 850 } 851 } 852 853 return 0, linuxerr.ENOENT 854 } 855 856 // IterDirents invokes cb on each entry in the directory represented by this inode. 857 // The directory entries will be iterated in alphabetical order. 858 func (i *Inode) IterDirents(cb func(name string, typ uint8, nid uint64) error) error { 859 if !i.IsDir() { 860 return linuxerr.ENOTDIR 861 } 862 863 // Iterate all the blocks which contain dirents. 864 for blockIdx := uint64(0); blockIdx < i.blocks; blockIdx++ { 865 block := i.getBlockDataInfo(blockIdx) 866 d, err := i.getDirent0(block) 867 if err != nil { 868 return err 869 } 870 // Iterate all the dirents in this block. 871 numDirents := d.NameOff / DirentSize 872 for { 873 name, err := i.getDirentName(d, block, numDirents == 1) 874 if err != nil { 875 return err 876 } 877 if err := cb(string(name), d.FileType, d.Nid()); err != nil { 878 return err 879 } 880 if numDirents--; numDirents == 0 { 881 break 882 } 883 d = direntAfter(d) 884 } 885 } 886 return nil 887 } 888 889 // Readlink reads the link target. 890 func (i *Inode) Readlink() (string, error) { 891 if !i.IsSymlink() { 892 return "", linuxerr.EINVAL 893 } 894 off := i.dataOff 895 size := i.size 896 if i.idataOff != 0 { 897 // Inline symlink data shouldn't cross block boundary. 
898 if i.blocks > 1 { 899 log.Warningf("Inline data cross block boundary at inode (nid=%v)", i.Nid()) 900 return "", linuxerr.EUCLEAN 901 } 902 off = i.idataOff 903 } else { 904 // This matches Linux's behaviour in fs/namei.c:page_get_link() and 905 // include/linux/namei.h:nd_terminate_link(). 906 if size > hostarch.PageSize-1 { 907 size = hostarch.PageSize - 1 908 } 909 } 910 target, err := i.image.BytesAt(off, size) 911 if err != nil { 912 return "", err 913 } 914 return string(target), nil 915 }