github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/vfs/syncing_file_linux.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux && !arm
    16  
    17  package vfs
    18  
import (
	"errors"
	"syscall"
)
    20  
    21  type syncFileRange func(fd int, off int64, n int64, flags int) (err error)
    22  
    23  // sync_file_range depends on both the filesystem, and the broader kernel
    24  // support. In particular, Windows Subsystem for Linux does not support
    25  // sync_file_range, even when used with ext{2,3,4}. syncRangeSmokeTest performs
    26  // a test of of sync_file_range, returning false on ENOSYS, and true otherwise.
    27  func syncRangeSmokeTest(fd uintptr, fn syncFileRange) bool {
    28  	err := fn(int(fd), 0 /* offset */, 0 /* nbytes */, 0 /* flags */)
    29  	return err != syscall.ENOSYS
    30  }
    31  
    32  func isSyncRangeSupported(fd uintptr) bool {
    33  	var stat syscall.Statfs_t
    34  	if err := syscall.Fstatfs(int(fd), &stat); err != nil {
    35  		return false
    36  	}
    37  
    38  	// Allowlist which filesystems we allow using sync_file_range with as some
    39  	// filesystems treat that syscall as a noop (notably ZFS). A allowlist is
    40  	// used instead of a denylist in order to have a more graceful failure mode
    41  	// in case a filesystem we haven't tested is encountered. Currently only
    42  	// ext2/3/4 are known to work properly.
    43  	const extMagic = 0xef53
    44  	switch stat.Type {
    45  	case extMagic:
    46  		return syncRangeSmokeTest(fd, syscall.SyncFileRange)
    47  	}
    48  	return false
    49  }
    50  
    51  func (f *syncingFile) init() {
    52  	if f.fd == 0 {
    53  		return
    54  	}
    55  	f.timeDiskOp(func() {
    56  		f.useSyncRange = isSyncRangeSupported(f.fd)
    57  	})
    58  	if f.useSyncRange {
    59  		f.syncTo = f.syncToRange
    60  	} else {
    61  		f.syncTo = f.syncToFdatasync
    62  	}
    63  	f.syncData = f.syncFdatasync
    64  }
    65  
    66  func (f *syncingFile) syncFdatasync() error {
    67  	if f.fd == 0 {
    68  		return f.File.Sync()
    69  	}
    70  	var err error
    71  	f.timeDiskOp(func() {
    72  		err = syscall.Fdatasync(int(f.fd))
    73  	})
    74  	return err
    75  }
    76  
    77  func (f *syncingFile) syncToFdatasync(_ int64) error {
    78  	return f.Sync()
    79  }
    80  
    81  func (f *syncingFile) syncToRange(offset int64) error {
    82  	const (
    83  		waitBefore = 0x1
    84  		write      = 0x2
    85  		// waitAfter = 0x4
    86  	)
    87  
    88  	// Note that syncToRange is only called with an offset that is guaranteed to
    89  	// be less than atomic.offset (i.e. the write offset). This implies the
    90  	// syncingFile.Close will Sync the rest of the data, as well as the file's
    91  	// metadata.
    92  	f.ratchetSyncOffset(offset)
    93  
    94  	// By specifying write|waitBefore for the flags, we're instructing
    95  	// SyncFileRange to a) wait for any outstanding data being written to finish,
    96  	// and b) to queue any other dirty data blocks in the range [0,offset] for
    97  	// writing. The actual writing of this data will occur asynchronously. The
    98  	// use of `waitBefore` is to limit how much dirty data is allowed to
    99  	// accumulate. Linux sometimes behaves poorly when a large amount of dirty
   100  	// data accumulates, impacting other I/O operations.
   101  	var err error
   102  	f.timeDiskOp(func() {
   103  		err = syscall.SyncFileRange(int(f.fd), 0, offset, write|waitBefore)
   104  	})
   105  	return err
   106  }