github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/vfs/syncing_file_linux.go (about)

     1  // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  //go:build linux && !arm
     6  // +build linux,!arm
     7  
     8  package vfs
     9  
    10  import "syscall"
    11  
    12  type syncFileRange func(fd int, off int64, n int64, flags int) (err error)
    13  
    14  // sync_file_range depends on both the filesystem, and the broader kernel
    15  // support. In particular, Windows Subsystem for Linux does not support
    16  // sync_file_range, even when used with ext{2,3,4}. syncRangeSmokeTest performs
    17  // a test of of sync_file_range, returning false on ENOSYS, and true otherwise.
    18  func syncRangeSmokeTest(fd uintptr, fn syncFileRange) bool {
    19  	err := fn(int(fd), 0 /* offset */, 0 /* nbytes */, 0 /* flags */)
    20  	return err != syscall.ENOSYS
    21  }
    22  
    23  func isSyncRangeSupported(fd uintptr) bool {
    24  	var stat syscall.Statfs_t
    25  	if err := syscall.Fstatfs(int(fd), &stat); err != nil {
    26  		return false
    27  	}
    28  
    29  	// Allowlist which filesystems we allow using sync_file_range with as some
    30  	// filesystems treat that syscall as a noop (notably ZFS). A allowlist is
    31  	// used instead of a denylist in order to have a more graceful failure mode
    32  	// in case a filesystem we haven't tested is encountered. Currently only
    33  	// ext2/3/4 are known to work properly.
    34  	const extMagic = 0xef53
    35  	switch stat.Type {
    36  	case extMagic:
    37  		return syncRangeSmokeTest(fd, syscall.SyncFileRange)
    38  	}
    39  	return false
    40  }
    41  
    42  func (f *syncingFile) init() {
    43  	if f.fd == 0 {
    44  		return
    45  	}
    46  	f.timeDiskOp(func() {
    47  		f.useSyncRange = isSyncRangeSupported(f.fd)
    48  	})
    49  	if f.useSyncRange {
    50  		f.syncTo = f.syncToRange
    51  	} else {
    52  		f.syncTo = f.syncToFdatasync
    53  	}
    54  	f.syncData = f.syncFdatasync
    55  }
    56  
    57  func (f *syncingFile) syncFdatasync() error {
    58  	if f.fd == 0 {
    59  		return f.File.Sync()
    60  	}
    61  	var err error
    62  	f.timeDiskOp(func() {
    63  		err = syscall.Fdatasync(int(f.fd))
    64  	})
    65  	return err
    66  }
    67  
    68  func (f *syncingFile) syncToFdatasync(_ int64) error {
    69  	return f.Sync()
    70  }
    71  
    72  func (f *syncingFile) syncToRange(offset int64) error {
    73  	const (
    74  		waitBefore = 0x1
    75  		write      = 0x2
    76  		// waitAfter = 0x4
    77  	)
    78  
    79  	// The flags for the sync_file_range system call. Unless the file has
    80  	// noSyncOnClose explicitly set and it is being closed, the waitBefore
    81  	// flag will be set which may block the call.
    82  	flags := write
    83  	if !f.noSyncOnClose || !f.closing {
    84  		flags |= waitBefore
    85  	}
    86  
    87  	// Note that syncToRange is only called with an offset that is guaranteed to
    88  	// be less than atomic.offset (i.e. the write offset). This implies the
    89  	// syncingFile.Close will Sync the rest of the data, as well as the file's
    90  	// metadata.
    91  	f.ratchetSyncOffset(offset)
    92  
    93  	// By specifying write|waitBefore for the flags, we're instructing
    94  	// SyncFileRange to a) wait for any outstanding data being written to finish,
    95  	// and b) to queue any other dirty data blocks in the range [0,offset] for
    96  	// writing. The actual writing of this data will occur asynchronously. The
    97  	// use of `waitBefore` is to limit how much dirty data is allowed to
    98  	// accumulate. Linux sometimes behaves poorly when a large amount of dirty
    99  	// data accumulates, impacting other I/O operations.
   100  	var err error
   101  	f.timeDiskOp(func() {
   102  		err = syscall.SyncFileRange(int(f.fd), 0, offset, flags)
   103  	})
   104  	return err
   105  }