github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/vfs/default_linux.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  //go:build linux
     6  // +build linux
     7  
     8  package vfs
     9  
    10  import (
    11  	"os"
    12  	"syscall"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"golang.org/x/sys/unix"
    16  )
    17  
    18  func wrapOSFileImpl(f *os.File) File {
    19  	lf := &linuxFile{File: f, fd: f.Fd()}
    20  	if lf.fd != InvalidFd {
    21  		lf.useSyncRange = isSyncRangeSupported(lf.fd)
    22  	}
    23  	return lf
    24  }
    25  
    26  func (defaultFS) OpenDir(name string) (File, error) {
    27  	f, err := os.OpenFile(name, syscall.O_CLOEXEC, 0)
    28  	if err != nil {
    29  		return nil, errors.WithStack(err)
    30  	}
    31  	return &linuxDir{f}, nil
    32  }
    33  
    34  // Assert that linuxFile and linuxDir implement vfs.File.
    35  var (
    36  	_ File = (*linuxDir)(nil)
    37  	_ File = (*linuxFile)(nil)
    38  )
    39  
    40  type linuxDir struct {
    41  	*os.File
    42  }
    43  
    44  func (d *linuxDir) Prefetch(offset int64, length int64) error      { return nil }
    45  func (d *linuxDir) Preallocate(offset, length int64) error         { return nil }
    46  func (d *linuxDir) SyncData() error                                { return d.Sync() }
    47  func (d *linuxDir) SyncTo(offset int64) (fullSync bool, err error) { return false, nil }
    48  
    49  type linuxFile struct {
    50  	*os.File
    51  	fd           uintptr
    52  	useSyncRange bool
    53  }
    54  
    55  func (f *linuxFile) Prefetch(offset int64, length int64) error {
    56  	_, _, err := unix.Syscall(unix.SYS_READAHEAD, uintptr(f.fd), uintptr(offset), uintptr(length))
    57  	return err
    58  }
    59  
    60  func (f *linuxFile) Preallocate(offset, length int64) error {
    61  	return unix.Fallocate(int(f.fd), unix.FALLOC_FL_KEEP_SIZE, offset, length)
    62  }
    63  
    64  func (f *linuxFile) SyncData() error {
    65  	return unix.Fdatasync(int(f.fd))
    66  }
    67  
    68  func (f *linuxFile) SyncTo(offset int64) (fullSync bool, err error) {
    69  	if !f.useSyncRange {
    70  		// Use fdatasync, which does provide persistence guarantees but won't
    71  		// update all file metadata. From the `fdatasync` man page:
    72  		//
    73  		// fdatasync() is similar to fsync(), but does not flush modified
    74  		// metadata unless that metadata is needed in order to allow a
    75  		// subsequent data retrieval to be correctly handled. For example,
    76  		// changes to st_atime or st_mtime (respectively, time of last access
    77  		// and time of last modification; see stat(2)) do not require flushing
    78  		// because they are not necessary for a subsequent data read to be
    79  		// handled correctly. On the other hand, a change to the file size
    80  		// (st_size, as made by say ftruncate(2)), would require a metadata
    81  		// flush.
    82  		if err = unix.Fdatasync(int(f.fd)); err != nil {
    83  			return false, err
    84  		}
    85  		return true, nil
    86  	}
    87  
    88  	const (
    89  		waitBefore = 0x1
    90  		write      = 0x2
    91  		// waitAfter = 0x4
    92  	)
    93  
    94  	// By specifying write|waitBefore for the flags, we're instructing
    95  	// SyncFileRange to a) wait for any outstanding data being written to finish,
    96  	// and b) to queue any other dirty data blocks in the range [0,offset] for
    97  	// writing. The actual writing of this data will occur asynchronously. The
    98  	// use of `waitBefore` is to limit how much dirty data is allowed to
    99  	// accumulate. Linux sometimes behaves poorly when a large amount of dirty
   100  	// data accumulates, impacting other I/O operations.
   101  	return false, unix.SyncFileRange(int(f.fd), 0, offset, write|waitBefore)
   102  }
   103  
   104  type syncFileRange func(fd int, off int64, n int64, flags int) (err error)
   105  
   106  // sync_file_range depends on both the filesystem, and the broader kernel
   107  // support. In particular, Windows Subsystem for Linux does not support
   108  // sync_file_range, even when used with ext{2,3,4}. syncRangeSmokeTest performs
   109  // a test of of sync_file_range, returning false on ENOSYS, and true otherwise.
   110  func syncRangeSmokeTest(fd uintptr, syncFn syncFileRange) bool {
   111  	err := syncFn(int(fd), 0 /* offset */, 0 /* nbytes */, 0 /* flags */)
   112  	return err != unix.ENOSYS
   113  }
   114  
   115  func isSyncRangeSupported(fd uintptr) bool {
   116  	var stat unix.Statfs_t
   117  	if err := unix.Fstatfs(int(fd), &stat); err != nil {
   118  		return false
   119  	}
   120  
   121  	// Allowlist which filesystems we allow using sync_file_range with as some
   122  	// filesystems treat that syscall as a noop (notably ZFS). A allowlist is
   123  	// used instead of a denylist in order to have a more graceful failure mode
   124  	// in case a filesystem we haven't tested is encountered. Currently only
   125  	// ext2/3/4 are known to work properly.
   126  	const extMagic = 0xef53
   127  	switch stat.Type {
   128  	case extMagic:
   129  		return syncRangeSmokeTest(fd, unix.SyncFileRange)
   130  	}
   131  	return false
   132  }