gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/erofs/erofs.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package erofs provides the ability to access the contents in an EROFS [1] image.
    16  //
    17  // The design principle of this package is that, it will just provide the ability
    18  // to access the contents in the image, and it will never cache any objects internally.
    19  // The whole disk image is mapped via a read-only/shared mapping, and it relies on
    20  // host kernel to cache the blocks/pages transparently.
    21  //
    22  // [1] https://docs.kernel.org/filesystems/erofs.html
    23  package erofs
    24  
    25  import (
    26  	"bytes"
    27  	"fmt"
    28  	"hash/crc32"
    29  	"os"
    30  
    31  	"golang.org/x/sys/unix"
    32  	"gvisor.dev/gvisor/pkg/abi/linux"
    33  	"gvisor.dev/gvisor/pkg/cleanup"
    34  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    35  	"gvisor.dev/gvisor/pkg/gohacks"
    36  	"gvisor.dev/gvisor/pkg/hostarch"
    37  	"gvisor.dev/gvisor/pkg/log"
    38  	"gvisor.dev/gvisor/pkg/marshal"
    39  	"gvisor.dev/gvisor/pkg/safemem"
    40  )
    41  
const (
	// Definitions for superblock.

	// SuperBlockMagicV1 is the value SuperBlock.Magic must hold for the image
	// to be accepted.
	SuperBlockMagicV1 = 0xe0f5e1e2
	// SuperBlockOffset is the byte offset in the image at which the
	// superblock is read.
	SuperBlockOffset  = 1024

	// Inode slot size in bit shift, i.e. an inode slot is 1<<InodeSlotBits
	// (32) bytes. Inode numbers are converted to offsets in units of slots.
	InodeSlotBits = 5

	// Max file name length, in bytes.
	MaxNameLen = 255
)
    53  
// Bit definitions for Inode*::Format.
//
// Each *Bit constant is the position of the lowest bit of a field within the
// format word, and the matching *Bits constant is the width of that field in
// bits. They are the arguments passed to bitRange by Inode.Layout and
// Inode.DataLayout.
const (
	InodeLayoutBit  = 0
	InodeLayoutBits = 1

	InodeDataLayoutBit  = 1
	InodeDataLayoutBits = 3
)
    62  
// Inode layouts, i.e. the possible values of Inode.Layout(). The compact
// layout is decoded as InodeCompact and the extended layout as InodeExtended.
const (
	InodeLayoutCompact  = 0
	InodeLayoutExtended = 1
)
    68  
// Inode data layouts, i.e. the possible values of Inode.DataLayout().
//
// Image.Inode only accepts InodeDataLayoutFlatPlain and
// InodeDataLayoutFlatInline; any other value is rejected with ENOTSUP.
const (
	InodeDataLayoutFlatPlain = iota
	InodeDataLayoutFlatCompressionLegacy
	InodeDataLayoutFlatInline
	InodeDataLayoutFlatCompression
	InodeDataLayoutChunkBased
	InodeDataLayoutMax
)
    78  
// Features w/ backward compatibility.
// This is not exhaustive, unused features are not listed.
const (
	// FeatureCompatSuperBlockChecksum, when set in SuperBlock.FeatureCompat,
	// makes the superblock checksum be verified when the image is opened.
	FeatureCompatSuperBlockChecksum = 0x00000001
)
    84  
// Features w/o backward compatibility.
//
// Any features that aren't in FeatureIncompatSupported are incompatible
// with this implementation.
//
// This is not exhaustive, unused features are not listed.
const (
	// FeatureIncompatSupported is the mask of incompatible features that are
	// accepted. It is currently empty, so an image with any bit set in
	// SuperBlock.FeatureIncompat is rejected.
	FeatureIncompatSupported = 0x0
)
    94  
// Sizes of on-disk structures in bytes. They correspond to the SuperBlock,
// InodeCompact, InodeExtended and Dirent types respectively.
const (
	SuperBlockSize    = 128
	InodeCompactSize  = 32
	InodeExtendedSize = 64
	DirentSize        = 12
)
   102  
// SuperBlock represents on-disk superblock.
//
// The field order and widths mirror the on-disk layout and must not be
// changed. Only the fields that this file reads are described below.
//
// +marshal
// +stateify savable
type SuperBlock struct {
	// Magic must equal SuperBlockMagicV1.
	Magic           uint32
	// Checksum is compared against a CRC32 (Castagnoli) computed at open time
	// when FeatureCompatSuperBlockChecksum is set.
	Checksum        uint32
	// FeatureCompat is a bitmask of backward-compatible features.
	FeatureCompat   uint32
	// BlockSizeBits is the block size expressed as a bit shift.
	BlockSizeBits   uint8
	ExtSlots        uint8
	// RootNid is the inode number of the root directory.
	RootNid         uint16
	Inodes          uint64
	// BuildTime and BuildTimeNsec are used as the modification time of
	// compact inodes, which carry no timestamp of their own.
	BuildTime       uint64
	BuildTimeNsec   uint32
	// Blocks is the total number of blocks in the image.
	Blocks          uint32
	// MetaBlockAddr is the block address at which the metadata area starts.
	MetaBlockAddr   uint32
	XattrBlockAddr  uint32
	UUID            [16]uint8
	VolumeName      [16]uint8
	// FeatureIncompat is a bitmask of features without backward
	// compatibility; see FeatureIncompatSupported.
	FeatureIncompat uint32
	Union1          uint16
	ExtraDevices    uint16
	DevTableSlotOff uint16
	Reserved        [38]uint8
}
   128  
   129  // BlockSize returns the block size.
   130  func (sb *SuperBlock) BlockSize() uint32 {
   131  	return 1 << sb.BlockSizeBits
   132  }
   133  
   134  // BlockAddrToOffset converts block addr to the offset in image file.
   135  func (sb *SuperBlock) BlockAddrToOffset(addr uint32) uint64 {
   136  	return uint64(addr) << sb.BlockSizeBits
   137  }
   138  
   139  // MetaOffset returns the offset of metadata area in image file.
   140  func (sb *SuperBlock) MetaOffset() uint64 {
   141  	return sb.BlockAddrToOffset(sb.MetaBlockAddr)
   142  }
   143  
   144  // NidToOffset converts inode number to the offset in image file.
   145  func (sb *SuperBlock) NidToOffset(nid uint64) uint64 {
   146  	return sb.MetaOffset() + (nid << InodeSlotBits)
   147  }
   148  
// InodeCompact represents 32-byte reduced form of on-disk inode.
//
// The field order and widths mirror the on-disk layout and must not be
// changed. Compared with InodeExtended it has narrower Size, Nlink, UID and
// GID fields and no modification time.
//
// +marshal
type InodeCompact struct {
	// Format encodes the inode layout and the data layout; see the
	// InodeLayoutBit* and InodeDataLayoutBit* constants.
	Format       uint16
	// XattrCount must be zero; inodes with xattrs are rejected by Image.Inode.
	XattrCount   uint16
	Mode         uint16
	Nlink        uint16
	Size         uint32
	Reserved     uint32
	// RawBlockAddr is the block address of the inode's data blocks.
	RawBlockAddr uint32
	Ino          uint32
	UID          uint16
	GID          uint16
	Reserved2    uint32
}
   165  
// InodeExtended represents 64-byte complete form of on-disk inode.
//
// The field order and widths mirror the on-disk layout and must not be
// changed.
//
// +marshal
type InodeExtended struct {
	// Format encodes the inode layout and the data layout; see the
	// InodeLayoutBit* and InodeDataLayoutBit* constants.
	Format       uint16
	// XattrCount must be zero; inodes with xattrs are rejected by Image.Inode.
	XattrCount   uint16
	Mode         uint16
	Reserved     uint16
	Size         uint64
	// RawBlockAddr is the block address of the inode's data blocks.
	RawBlockAddr uint32
	Ino          uint32
	UID          uint32
	GID          uint32
	// Mtime and MtimeNsec hold the modification time of this inode.
	Mtime        uint64
	MtimeNsec    uint32
	Nlink        uint32
	Reserved2    [16]uint8
}
   184  
// Dirent represents on-disk directory entry.
//
// The field order and widths mirror the on-disk layout and must not be
// changed.
//
// +marshal
type Dirent struct {
	// NidLow and NidHigh are the low and high 32 bits of the inode number of
	// the referenced inode; see Nid.
	NidLow   uint32
	NidHigh  uint32
	// NameOff is the offset of this entry's name, relative to the start of
	// the directory block that holds the entry.
	NameOff  uint16
	FileType uint8
	Reserved uint8
}
   195  
   196  // Nid returns the inode number of the inode referenced by this dirent.
   197  func (d *Dirent) Nid() uint64 {
   198  	// EROFS on-disk structures are always in little endian.
   199  	// TODO: This implementation does not support big endian yet.
   200  	return (uint64(d.NidHigh) << 32) | uint64(d.NidLow)
   201  }
   202  
// Image represents an open EROFS image.
//
// +stateify savable
type Image struct {
	// src is the image file. It is owned by the Image once OpenImage
	// succeeds and is closed by Close.
	src   *os.File `state:"nosave"`
	// bytes is the read-only shared mapping of the whole image file.
	bytes []byte   `state:"nosave"`
	// sb is a copy of the on-disk superblock, filled in by initSuperBlock.
	sb    SuperBlock
}
   211  
   212  // OpenImage returns an Image providing access to the contents in the image file src.
   213  //
   214  // On success, the ownership of src is transferred to Image.
   215  func OpenImage(src *os.File) (*Image, error) {
   216  	i := &Image{src: src}
   217  
   218  	var cu cleanup.Cleanup
   219  	defer cu.Clean()
   220  
   221  	stat, err := i.src.Stat()
   222  	if err != nil {
   223  		return nil, err
   224  	}
   225  	i.bytes, err = unix.Mmap(int(i.src.Fd()), 0, int(stat.Size()), unix.PROT_READ, unix.MAP_SHARED)
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  	cu.Add(func() { unix.Munmap(i.bytes) })
   230  
   231  	if err := i.initSuperBlock(); err != nil {
   232  		return nil, err
   233  	}
   234  	cu.Release()
   235  	return i, nil
   236  }
   237  
   238  // Close closes the image.
   239  func (i *Image) Close() {
   240  	unix.Munmap(i.bytes)
   241  	i.src.Close()
   242  }
   243  
   244  // SuperBlock returns a copy of the image's superblock.
   245  func (i *Image) SuperBlock() SuperBlock {
   246  	return i.sb
   247  }
   248  
   249  // BlockSize returns the block size of this image.
   250  func (i *Image) BlockSize() uint32 {
   251  	return i.sb.BlockSize()
   252  }
   253  
   254  // Blocks returns the total blocks of this image.
   255  func (i *Image) Blocks() uint32 {
   256  	return i.sb.Blocks
   257  }
   258  
   259  // RootNid returns the root inode number of this image.
   260  func (i *Image) RootNid() uint64 {
   261  	return uint64(i.sb.RootNid)
   262  }
   263  
   264  // initSuperBlock initializes the superblock of this image.
   265  func (i *Image) initSuperBlock() error {
   266  	// i.sb is used in the hot path. Let's save a copy of the superblock.
   267  	if err := i.unmarshalAt(&i.sb, SuperBlockOffset); err != nil {
   268  		return fmt.Errorf("image size is too small")
   269  	}
   270  
   271  	if i.sb.Magic != SuperBlockMagicV1 {
   272  		return fmt.Errorf("unknown magic: 0x%x", i.sb.Magic)
   273  	}
   274  
   275  	if err := i.verifyChecksum(); err != nil {
   276  		return err
   277  	}
   278  
   279  	if featureIncompat := i.sb.FeatureIncompat & ^uint32(FeatureIncompatSupported); featureIncompat != 0 {
   280  		return fmt.Errorf("unsupported incompatible features detected: 0x%x", featureIncompat)
   281  	}
   282  
   283  	if i.BlockSize()%hostarch.PageSize != 0 {
   284  		return fmt.Errorf("unsupported block size: 0x%x", i.BlockSize())
   285  	}
   286  
   287  	return nil
   288  }
   289  
   290  // verifyChecksum verifies the checksum of the superblock.
   291  func (i *Image) verifyChecksum() error {
   292  	if i.sb.FeatureCompat&FeatureCompatSuperBlockChecksum == 0 {
   293  		return nil
   294  	}
   295  
   296  	sb := i.sb
   297  	sb.Checksum = 0
   298  	table := crc32.MakeTable(crc32.Castagnoli)
   299  	checksum := crc32.Checksum(marshal.Marshal(&sb), table)
   300  
   301  	off := SuperBlockOffset + uint64(i.sb.SizeBytes())
   302  	if bytes, err := i.BytesAt(off, uint64(i.BlockSize())-off); err != nil {
   303  		return fmt.Errorf("image size is too small")
   304  	} else {
   305  		checksum = ^crc32.Update(checksum, table, bytes)
   306  	}
   307  	if checksum != i.sb.Checksum {
   308  		return fmt.Errorf("invalid checksum: 0x%x, expected: 0x%x", checksum, i.sb.Checksum)
   309  	}
   310  
   311  	return nil
   312  }
   313  
   314  // FD returns the host FD of underlying image file.
   315  func (i *Image) FD() int {
   316  	return int(i.src.Fd())
   317  }
   318  
   319  // checkRange checks whether the range [off, off+n) is valid.
   320  func (i *Image) checkRange(off, n uint64) bool {
   321  	size := uint64(len(i.bytes))
   322  	end := off + n
   323  	return off < size && off <= end && end <= size
   324  }
   325  
   326  // BytesAt returns the bytes at [off, off+n) of the image.
   327  func (i *Image) BytesAt(off, n uint64) ([]byte, error) {
   328  	if ok := i.checkRange(off, n); !ok {
   329  		log.Warningf("Invalid byte range (off: 0x%x, n: 0x%x) for image (size: 0x%x)", off, n, len(i.bytes))
   330  		return nil, linuxerr.EFAULT
   331  	}
   332  	return i.bytes[off : off+n], nil
   333  }
   334  
   335  // checkInodeAlignment checks whether off matches inode's alignment requirement.
   336  func checkInodeAlignment(off uint64) bool {
   337  	// Each valid inode should be aligned with an inode slot, which is
   338  	// a fixed value (32 bytes).
   339  	return off&((1<<InodeSlotBits)-1) == 0
   340  }
   341  
   342  // inodeFormatAt returns the format of the inode at offset off within the
   343  // memory backed by image.
   344  func (i *Image) inodeFormatAt(off uint64) (uint16, error) {
   345  	if ok := checkInodeAlignment(off); !ok {
   346  		return 0, linuxerr.EFAULT
   347  	}
   348  	if ok := i.checkRange(off, 2); !ok {
   349  		return 0, linuxerr.EFAULT
   350  	}
   351  	return *(*uint16)(i.pointerAt(off)), nil
   352  }
   353  
   354  // inodeCompactAt returns a pointer to the compact inode at offset off within
   355  // the memory backed by image.
   356  func (i *Image) inodeCompactAt(off uint64) (*InodeCompact, error) {
   357  	if ok := checkInodeAlignment(off); !ok {
   358  		return nil, linuxerr.EFAULT
   359  	}
   360  	if ok := i.checkRange(off, InodeCompactSize); !ok {
   361  		return nil, linuxerr.EFAULT
   362  	}
   363  	return (*InodeCompact)(i.pointerAt(off)), nil
   364  }
   365  
   366  // inodeExtendedAt returns a pointer to the extended inode at offset off within
   367  // the memory backed by image.
   368  func (i *Image) inodeExtendedAt(off uint64) (*InodeExtended, error) {
   369  	if ok := checkInodeAlignment(off); !ok {
   370  		return nil, linuxerr.EFAULT
   371  	}
   372  	if ok := i.checkRange(off, InodeExtendedSize); !ok {
   373  		return nil, linuxerr.EFAULT
   374  	}
   375  	return (*InodeExtended)(i.pointerAt(off)), nil
   376  }
   377  
   378  // direntAt returns a pointer to the dirent at offset off within the memory
   379  // backed by image.
   380  func (i *Image) direntAt(off uint64) (*Dirent, error) {
   381  	// Each valid dirent should be aligned to 4 bytes.
   382  	if off&3 != 0 {
   383  		return nil, linuxerr.EFAULT
   384  	}
   385  	if ok := i.checkRange(off, DirentSize); !ok {
   386  		return nil, linuxerr.EFAULT
   387  	}
   388  	return (*Dirent)(i.pointerAt(off)), nil
   389  }
   390  
   391  // unmarshalAt deserializes data from the bytes at [off, off+n) of the image.
   392  func (i *Image) unmarshalAt(data marshal.Marshallable, off uint64) error {
   393  	bytes, err := i.BytesAt(off, uint64(data.SizeBytes()))
   394  	if err != nil {
   395  		log.Warningf("Failed to deserialize %T from 0x%x.", data, off)
   396  		return err
   397  	}
   398  	data.UnmarshalUnsafe(bytes)
   399  	return nil
   400  }
   401  
   402  // Inode returns the inode identified by nid.
   403  func (i *Image) Inode(nid uint64) (Inode, error) {
   404  	inode := Inode{
   405  		image: i,
   406  		nid:   nid,
   407  	}
   408  
   409  	off := i.sb.NidToOffset(nid)
   410  	if format, err := i.inodeFormatAt(off); err != nil {
   411  		return Inode{}, err
   412  	} else {
   413  		inode.format = format
   414  	}
   415  
   416  	var (
   417  		rawBlockAddr uint32
   418  		inodeSize    int
   419  	)
   420  
   421  	switch layout := inode.Layout(); layout {
   422  	case InodeLayoutCompact:
   423  		ino, err := i.inodeCompactAt(off)
   424  		if err != nil {
   425  			return Inode{}, err
   426  		}
   427  
   428  		if ino.XattrCount != 0 {
   429  			log.Warningf("Unsupported xattr at inode (nid=%v)", nid)
   430  			return Inode{}, linuxerr.ENOTSUP
   431  		}
   432  
   433  		rawBlockAddr = ino.RawBlockAddr
   434  		inodeSize = ino.SizeBytes()
   435  
   436  		inode.size = uint64(ino.Size)
   437  		inode.nlink = uint32(ino.Nlink)
   438  		inode.mode = ino.Mode
   439  		inode.uid = uint32(ino.UID)
   440  		inode.gid = uint32(ino.GID)
   441  		inode.mtime = i.sb.BuildTime
   442  		inode.mtimeNsec = i.sb.BuildTimeNsec
   443  
   444  	case InodeLayoutExtended:
   445  		ino, err := i.inodeExtendedAt(off)
   446  		if err != nil {
   447  			return Inode{}, err
   448  		}
   449  
   450  		if ino.XattrCount != 0 {
   451  			log.Warningf("Unsupported xattr at inode (nid=%v)", nid)
   452  			return Inode{}, linuxerr.ENOTSUP
   453  		}
   454  
   455  		rawBlockAddr = ino.RawBlockAddr
   456  		inodeSize = ino.SizeBytes()
   457  
   458  		inode.size = ino.Size
   459  		inode.nlink = ino.Nlink
   460  		inode.mode = ino.Mode
   461  		inode.uid = ino.UID
   462  		inode.gid = ino.GID
   463  		inode.mtime = ino.Mtime
   464  		inode.mtimeNsec = ino.MtimeNsec
   465  
   466  	default:
   467  		log.Warningf("Unsupported layout 0x%x at inode (nid=%v)", layout, nid)
   468  		return Inode{}, linuxerr.ENOTSUP
   469  	}
   470  
   471  	blockSize := uint64(i.BlockSize())
   472  	inode.blocks = (inode.size + (blockSize - 1)) / blockSize
   473  
   474  	switch dataLayout := inode.DataLayout(); dataLayout {
   475  	case InodeDataLayoutFlatInline:
   476  		// Check that whether the file data in the last block fits into
   477  		// the remaining room of the metadata block.
   478  		tailSize := inode.size & (blockSize - 1)
   479  		if tailSize == 0 || tailSize > blockSize-uint64(inodeSize) {
   480  			log.Warningf("Inline data not found or cross block boundary at inode (nid=%v)", nid)
   481  			return Inode{}, linuxerr.EUCLEAN
   482  		}
   483  		inode.idataOff = off + uint64(inodeSize)
   484  		fallthrough
   485  
   486  	case InodeDataLayoutFlatPlain:
   487  		inode.dataOff = i.sb.BlockAddrToOffset(rawBlockAddr)
   488  
   489  	default:
   490  		log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, nid)
   491  		return Inode{}, linuxerr.ENOTSUP
   492  	}
   493  
   494  	return inode, nil
   495  }
   496  
// Inode represents in-memory inode object.
//
// Values of this type are produced by Image.Inode, which decodes either a
// compact or an extended on-disk inode into this common form.
//
// +stateify savable
type Inode struct {
	// image is the underlying image. Inode should not perform writable
	// operations (e.g. Close()) on the image.
	image *Image

	// dataOff points to the data of this inode in the data blocks.
	dataOff uint64

	// idataOff points to the tail packing inline data of this inode
	// if it's not zero in the metadata block.
	idataOff uint64

	// blocks indicates the count of blocks that store the data associated
	// with this inode. It will count in the metadata block that includes
	// the inline data as well.
	blocks uint64

	// format is the format of this inode.
	format uint16

	// Metadata.

	// mode holds the file type and permission bits.
	mode      uint16
	// nid is the inode number this inode was looked up by.
	nid       uint64
	// size is the data size in bytes.
	size      uint64
	// mtime and mtimeNsec are the modification time. For compact inodes they
	// are taken from the superblock build time.
	mtime     uint64
	mtimeNsec uint32
	uid       uint32
	gid       uint32
	nlink     uint32
}
   530  
   531  // bitRange returns the bits within the range [bit, bit+bits) in value.
   532  func bitRange(value, bit, bits uint16) uint16 {
   533  	return (value >> bit) & ((1 << bits) - 1)
   534  }
   535  
   536  // Layout returns the inode layout.
   537  func (i *Inode) Layout() uint16 {
   538  	return bitRange(i.format, InodeLayoutBit, InodeLayoutBits)
   539  }
   540  
   541  // DataLayout returns the inode data layout.
   542  func (i *Inode) DataLayout() uint16 {
   543  	return bitRange(i.format, InodeDataLayoutBit, InodeDataLayoutBits)
   544  }
   545  
   546  // IsRegular indicates whether i represents a regular file.
   547  func (i *Inode) IsRegular() bool {
   548  	return i.mode&linux.S_IFMT == linux.S_IFREG
   549  }
   550  
   551  // IsDir indicates whether i represents a directory.
   552  func (i *Inode) IsDir() bool {
   553  	return i.mode&linux.S_IFMT == linux.S_IFDIR
   554  }
   555  
   556  // IsCharDev indicates whether i represents a character device.
   557  func (i *Inode) IsCharDev() bool {
   558  	return i.mode&linux.S_IFMT == linux.S_IFCHR
   559  }
   560  
   561  // IsBlockDev indicates whether i represents a block device.
   562  func (i *Inode) IsBlockDev() bool {
   563  	return i.mode&linux.S_IFMT == linux.S_IFBLK
   564  }
   565  
   566  // IsFIFO indicates whether i represents a named pipe.
   567  func (i *Inode) IsFIFO() bool {
   568  	return i.mode&linux.S_IFMT == linux.S_IFIFO
   569  }
   570  
   571  // IsSocket indicates whether i represents a socket.
   572  func (i *Inode) IsSocket() bool {
   573  	return i.mode&linux.S_IFMT == linux.S_IFSOCK
   574  }
   575  
   576  // IsSymlink indicates whether i represents a symbolic link.
   577  func (i *Inode) IsSymlink() bool {
   578  	return i.mode&linux.S_IFMT == linux.S_IFLNK
   579  }
   580  
   581  // Nid returns the inode number.
   582  func (i *Inode) Nid() uint64 {
   583  	return i.nid
   584  }
   585  
   586  // Size returns the data size.
   587  func (i *Inode) Size() uint64 {
   588  	return i.size
   589  }
   590  
   591  // Nlink returns the number of hard links.
   592  func (i *Inode) Nlink() uint32 {
   593  	return i.nlink
   594  }
   595  
   596  // Mtime returns the time of last modification.
   597  func (i *Inode) Mtime() uint64 {
   598  	return i.mtime
   599  }
   600  
   601  // MtimeNsec returns the nano second part of Mtime.
   602  func (i *Inode) MtimeNsec() uint32 {
   603  	return i.mtimeNsec
   604  }
   605  
   606  // Mode returns the file type and permissions.
   607  func (i *Inode) Mode() uint16 {
   608  	return i.mode
   609  }
   610  
   611  // UID returns the user ID of the owner.
   612  func (i *Inode) UID() uint32 {
   613  	return i.uid
   614  }
   615  
   616  // GID returns the group ID of the owner.
   617  func (i *Inode) GID() uint32 {
   618  	return i.gid
   619  }
   620  
   621  // DataOffset returns the data offset of this inode in image file.
   622  func (i *Inode) DataOffset() (uint64, error) {
   623  	// TODO: We don't support regular files with inline data yet, which means the image
   624  	// should be created with the "-E noinline_data" option. The "-E noinline_data" option
   625  	// was introduced for the DAX feature support in Linux [1].
   626  	// [1] https://github.com/erofs/erofs-utils/commit/60549d52c3b636f0ddd1d51b0c1517c1dee22595
   627  	if dataLayout := i.DataLayout(); dataLayout != InodeDataLayoutFlatPlain {
   628  		log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, i.Nid())
   629  		return 0, linuxerr.ENOTSUP
   630  	}
   631  	return i.dataOff, nil
   632  }
   633  
   634  // Data returns the read-only file data of this inode.
   635  func (i *Inode) Data() (safemem.BlockSeq, error) {
   636  	switch dataLayout := i.DataLayout(); dataLayout {
   637  	case InodeDataLayoutFlatPlain:
   638  		bytes, err := i.image.BytesAt(i.dataOff, i.size)
   639  		if err != nil {
   640  			return safemem.BlockSeq{}, err
   641  		}
   642  		return safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bytes)), nil
   643  
   644  	case InodeDataLayoutFlatInline:
   645  		sl := make([]safemem.Block, 0, 2)
   646  		idataSize := i.size & (uint64(i.image.BlockSize()) - 1)
   647  		if i.size > idataSize {
   648  			if bytes, err := i.image.BytesAt(i.dataOff, i.size-idataSize); err != nil {
   649  				return safemem.BlockSeq{}, err
   650  			} else {
   651  				sl = append(sl, safemem.BlockFromSafeSlice(bytes))
   652  			}
   653  		}
   654  		if bytes, err := i.image.BytesAt(i.idataOff, idataSize); err != nil {
   655  			return safemem.BlockSeq{}, err
   656  		} else {
   657  			sl = append(sl, safemem.BlockFromSafeSlice(bytes))
   658  		}
   659  		return safemem.BlockSeqFromSlice(sl), nil
   660  
   661  	default:
   662  		log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, i.Nid())
   663  		return safemem.BlockSeq{}, linuxerr.ENOTSUP
   664  	}
   665  }
   666  
// blockData represents the information of the data in a block.
//
// The zero value is treated as "no block"; see valid.
type blockData struct {
	// base indicates the data offset within the image.
	base uint64
	// size indicates the data size, in bytes.
	size uint32
}
   674  
   675  // valid indicates whether this is valid information about the data in a block.
   676  func (b *blockData) valid() bool {
   677  	// The data offset within the image will never be zero.
   678  	return b.base > 0
   679  }
   680  
   681  // getBlockDataInfo returns the information of the data in the block identified by
   682  // blockIdx of this inode.
   683  //
   684  // Precondition: blockIdx < i.blocks.
   685  func (i *Inode) getBlockDataInfo(blockIdx uint64) blockData {
   686  	blockSize := i.image.BlockSize()
   687  	lastBlock := blockIdx == i.blocks-1
   688  	base := i.idataOff
   689  	if !lastBlock || base == 0 {
   690  		base = i.dataOff + blockIdx*uint64(blockSize)
   691  	}
   692  	size := blockSize
   693  	if lastBlock {
   694  		if tailSize := uint32(i.size) & (blockSize - 1); tailSize != 0 {
   695  			size = tailSize
   696  		}
   697  	}
   698  	return blockData{base, size}
   699  }
   700  
   701  // getDirentName returns the name of dirent d in the given block of this inode.
   702  //
   703  // The on-disk format of one block looks like this:
   704  //
   705  //	                 ___________________________
   706  //	                /                           |
   707  //	               /              ______________|________________
   708  //	              /              /              | nameoff1       | nameoffN-1
   709  //	 ____________.______________._______________v________________v__________
   710  //	| dirent | dirent | ... | dirent | filename | filename | ... | filename |
   711  //	|___.0___|____1___|_____|___N-1__|____0_____|____1_____|_____|___N-1____|
   712  //	     \                           ^
   713  //	      \                          |                           * could have
   714  //	       \                         |                             trailing '\0'
   715  //	        \________________________| nameoff0
   716  //	                            Directory block
   717  //
   718  // The on-disk format of one directory looks like this:
   719  //
   720  // [ (block 1) dirent 1 | dirent 2 | dirent 3 | name 1 | name 2 | name 3 | optional padding ]
   721  // [ (block 2) dirent 4 | dirent 5 | name 4 | name 5 | optional padding ]
   722  // ...
   723  // [ (block N) dirent M | dirent M+1 | name M | name M+1 | optional padding ]
   724  //
   725  // [ (metadata block) inode | optional fields | dirent M+2 | dirent M+3 | name M+2 | name M+3 | optional padding ]
   726  //
   727  // Refer: https://docs.kernel.org/filesystems/erofs.html#directories
   728  func (i *Inode) getDirentName(d *Dirent, block blockData, lastDirent bool) ([]byte, error) {
   729  	var nameLen uint32
   730  	if lastDirent {
   731  		nameLen = block.size - uint32(d.NameOff)
   732  	} else {
   733  		nameLen = uint32(direntAfter(d).NameOff - d.NameOff)
   734  	}
   735  	if uint32(d.NameOff)+nameLen > block.size || nameLen > MaxNameLen || nameLen == 0 {
   736  		log.Warningf("Corrupted dirent at inode (nid=%v)", i.Nid())
   737  		return nil, linuxerr.EUCLEAN
   738  	}
   739  	name, err := i.image.BytesAt(block.base+uint64(d.NameOff), uint64(nameLen))
   740  	if err != nil {
   741  		return nil, err
   742  	}
   743  	if lastDirent {
   744  		// Optional padding may exist at the end of a block.
   745  		n := bytes.IndexByte(name, 0)
   746  		if n == 0 {
   747  			log.Warningf("Corrupted dirent at inode (nid=%v)", i.Nid())
   748  			return nil, linuxerr.EUCLEAN
   749  		}
   750  		if n != -1 {
   751  			name = name[:n]
   752  		}
   753  	}
   754  	return name, nil
   755  }
   756  
   757  // getDirent0 returns a pointer to the first dirent in the given block of this inode.
   758  func (i *Inode) getDirent0(block blockData) (*Dirent, error) {
   759  	d0, err := i.image.direntAt(block.base)
   760  	if err != nil {
   761  		return nil, err
   762  	}
   763  	if d0.NameOff < DirentSize || uint32(d0.NameOff) >= block.size {
   764  		log.Warningf("Invalid nameOff0 %v at inode (nid=%v)", d0.NameOff, i.Nid())
   765  		return nil, linuxerr.EUCLEAN
   766  	}
   767  	return d0, nil
   768  }
   769  
   770  // Lookup looks up a child by the name. The child inode number will be returned on success.
   771  func (i *Inode) Lookup(name string) (uint64, error) {
   772  	if !i.IsDir() {
   773  		return 0, linuxerr.ENOTDIR
   774  	}
   775  
   776  	// Currently (Go 1.21), there is no safe and efficient way to do three-way
   777  	// string comparisons, so let's convert the string to a byte slice first.
   778  	nameBytes := gohacks.ImmutableBytesFromString(name)
   779  
   780  	// In EROFS, all directory entries are _strictly_ recorded in alphabetical
   781  	// order. The lookup is done by directly performing binary search on the
   782  	// disk data similar to what Linux does in fs/erofs/namei.c:erofs_namei().
   783  	var (
   784  		targetBlock      blockData
   785  		targetNumDirents uint16
   786  	)
   787  
   788  	// Find the block that may contain the target dirent first.
   789  	bLeft, bRight := int64(0), int64(i.blocks)-1
   790  	for bLeft <= bRight {
   791  		// Cast to uint64 to avoid overflow.
   792  		mid := uint64(bLeft+bRight) >> 1
   793  		block := i.getBlockDataInfo(mid)
   794  		d0, err := i.getDirent0(block)
   795  		if err != nil {
   796  			return 0, err
   797  		}
   798  		numDirents := d0.NameOff / DirentSize
   799  		d0Name, err := i.getDirentName(d0, block, numDirents == 1)
   800  		if err != nil {
   801  			return 0, err
   802  		}
   803  		switch bytes.Compare(nameBytes, d0Name) {
   804  		case 0:
   805  			// Found the target dirent.
   806  			return d0.Nid(), nil
   807  		case 1:
   808  			// name > d0Name, this block may contain the target dirent.
   809  			targetBlock = block
   810  			targetNumDirents = numDirents
   811  			bLeft = int64(mid) + 1
   812  		case -1:
   813  			// name < d0Name, this is not the block we're looking for.
   814  			bRight = int64(mid) - 1
   815  		}
   816  	}
   817  
   818  	if !targetBlock.valid() {
   819  		// The target block was not found.
   820  		return 0, linuxerr.ENOENT
   821  	}
   822  
   823  	// Find the target dirent in the target block. Note that, as the 0th dirent
   824  	// has already been checked during the block binary search, we don't need to
   825  	// check it again and can define dLeft/dRight as unsigned types.
   826  	dLeft, dRight := uint16(1), targetNumDirents-1
   827  	for dLeft <= dRight {
   828  		// The sum will never lead to a uint16 overflow, as the maximum value of
   829  		// the operands is MaxUint16/DirentSize.
   830  		mid := (dLeft + dRight) >> 1
   831  		direntOff := targetBlock.base + uint64(mid)*DirentSize
   832  		d, err := i.image.direntAt(direntOff)
   833  		if err != nil {
   834  			return 0, err
   835  		}
   836  		dName, err := i.getDirentName(d, targetBlock, mid == targetNumDirents-1)
   837  		if err != nil {
   838  			return 0, err
   839  		}
   840  		switch bytes.Compare(nameBytes, dName) {
   841  		case 0:
   842  			// Found the target dirent.
   843  			return d.Nid(), nil
   844  		case 1:
   845  			// name > dName.
   846  			dLeft = mid + 1
   847  		case -1:
   848  			// name < dName.
   849  			dRight = mid - 1
   850  		}
   851  	}
   852  
   853  	return 0, linuxerr.ENOENT
   854  }
   855  
   856  // IterDirents invokes cb on each entry in the directory represented by this inode.
   857  // The directory entries will be iterated in alphabetical order.
   858  func (i *Inode) IterDirents(cb func(name string, typ uint8, nid uint64) error) error {
   859  	if !i.IsDir() {
   860  		return linuxerr.ENOTDIR
   861  	}
   862  
   863  	// Iterate all the blocks which contain dirents.
   864  	for blockIdx := uint64(0); blockIdx < i.blocks; blockIdx++ {
   865  		block := i.getBlockDataInfo(blockIdx)
   866  		d, err := i.getDirent0(block)
   867  		if err != nil {
   868  			return err
   869  		}
   870  		// Iterate all the dirents in this block.
   871  		numDirents := d.NameOff / DirentSize
   872  		for {
   873  			name, err := i.getDirentName(d, block, numDirents == 1)
   874  			if err != nil {
   875  				return err
   876  			}
   877  			if err := cb(string(name), d.FileType, d.Nid()); err != nil {
   878  				return err
   879  			}
   880  			if numDirents--; numDirents == 0 {
   881  				break
   882  			}
   883  			d = direntAfter(d)
   884  		}
   885  	}
   886  	return nil
   887  }
   888  
   889  // Readlink reads the link target.
   890  func (i *Inode) Readlink() (string, error) {
   891  	if !i.IsSymlink() {
   892  		return "", linuxerr.EINVAL
   893  	}
   894  	off := i.dataOff
   895  	size := i.size
   896  	if i.idataOff != 0 {
   897  		// Inline symlink data shouldn't cross block boundary.
   898  		if i.blocks > 1 {
   899  			log.Warningf("Inline data cross block boundary at inode (nid=%v)", i.Nid())
   900  			return "", linuxerr.EUCLEAN
   901  		}
   902  		off = i.idataOff
   903  	} else {
   904  		// This matches Linux's behaviour in fs/namei.c:page_get_link() and
   905  		// include/linux/namei.h:nd_terminate_link().
   906  		if size > hostarch.PageSize-1 {
   907  			size = hostarch.PageSize - 1
   908  		}
   909  	}
   910  	target, err := i.image.BytesAt(off, size)
   911  	if err != nil {
   912  		return "", err
   913  	}
   914  	return string(target), nil
   915  }