github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/vfs/default_linux.go (about) 1 // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 //go:build linux 6 // +build linux 7 8 package vfs 9 10 import ( 11 "os" 12 "syscall" 13 14 "github.com/cockroachdb/errors" 15 "golang.org/x/sys/unix" 16 ) 17 18 func wrapOSFileImpl(f *os.File) File { 19 lf := &linuxFile{File: f, fd: f.Fd()} 20 if lf.fd != InvalidFd { 21 lf.useSyncRange = isSyncRangeSupported(lf.fd) 22 } 23 return lf 24 } 25 26 func (defaultFS) OpenDir(name string) (File, error) { 27 f, err := os.OpenFile(name, syscall.O_CLOEXEC, 0) 28 if err != nil { 29 return nil, errors.WithStack(err) 30 } 31 return &linuxDir{f}, nil 32 } 33 34 // Assert that linuxFile and linuxDir implement vfs.File. 35 var ( 36 _ File = (*linuxDir)(nil) 37 _ File = (*linuxFile)(nil) 38 ) 39 40 type linuxDir struct { 41 *os.File 42 } 43 44 func (d *linuxDir) Prefetch(offset int64, length int64) error { return nil } 45 func (d *linuxDir) Preallocate(offset, length int64) error { return nil } 46 func (d *linuxDir) SyncData() error { return d.Sync() } 47 func (d *linuxDir) SyncTo(offset int64) (fullSync bool, err error) { return false, nil } 48 49 type linuxFile struct { 50 *os.File 51 fd uintptr 52 useSyncRange bool 53 } 54 55 func (f *linuxFile) Prefetch(offset int64, length int64) error { 56 _, _, err := unix.Syscall(unix.SYS_READAHEAD, uintptr(f.fd), uintptr(offset), uintptr(length)) 57 return err 58 } 59 60 func (f *linuxFile) Preallocate(offset, length int64) error { 61 return unix.Fallocate(int(f.fd), unix.FALLOC_FL_KEEP_SIZE, offset, length) 62 } 63 64 func (f *linuxFile) SyncData() error { 65 return unix.Fdatasync(int(f.fd)) 66 } 67 68 func (f *linuxFile) SyncTo(offset int64) (fullSync bool, err error) { 69 if !f.useSyncRange { 70 // Use fdatasync, which does provide persistence guarantees but won't 71 // update all file metadata. From the `fdatasync` man page: 72 // 73 // fdatasync() is similar to fsync(), but does not flush modified 74 // metadata unless that metadata is needed in order to allow a 75 // subsequent data retrieval to be correctly handled. For example, 76 // changes to st_atime or st_mtime (respectively, time of last access 77 // and time of last modification; see stat(2)) do not require flushing 78 // because they are not necessary for a subsequent data read to be 79 // handled correctly. On the other hand, a change to the file size 80 // (st_size, as made by say ftruncate(2)), would require a metadata 81 // flush. 82 if err = unix.Fdatasync(int(f.fd)); err != nil { 83 return false, err 84 } 85 return true, nil 86 } 87 88 const ( 89 waitBefore = 0x1 90 write = 0x2 91 // waitAfter = 0x4 92 ) 93 94 // By specifying write|waitBefore for the flags, we're instructing 95 // SyncFileRange to a) wait for any outstanding data being written to finish, 96 // and b) to queue any other dirty data blocks in the range [0,offset] for 97 // writing. The actual writing of this data will occur asynchronously. The 98 // use of `waitBefore` is to limit how much dirty data is allowed to 99 // accumulate. Linux sometimes behaves poorly when a large amount of dirty 100 // data accumulates, impacting other I/O operations. 101 return false, unix.SyncFileRange(int(f.fd), 0, offset, write|waitBefore) 102 } 103 104 type syncFileRange func(fd int, off int64, n int64, flags int) (err error) 105 106 // sync_file_range depends on both the filesystem, and the broader kernel 107 // support. In particular, Windows Subsystem for Linux does not support 108 // sync_file_range, even when used with ext{2,3,4}. syncRangeSmokeTest performs 109 // a test of of sync_file_range, returning false on ENOSYS, and true otherwise. 110 func syncRangeSmokeTest(fd uintptr, syncFn syncFileRange) bool { 111 err := syncFn(int(fd), 0 /* offset */, 0 /* nbytes */, 0 /* flags */) 112 return err != unix.ENOSYS 113 } 114 115 func isSyncRangeSupported(fd uintptr) bool { 116 var stat unix.Statfs_t 117 if err := unix.Fstatfs(int(fd), &stat); err != nil { 118 return false 119 } 120 121 // Allowlist which filesystems we allow using sync_file_range with as some 122 // filesystems treat that syscall as a noop (notably ZFS). A allowlist is 123 // used instead of a denylist in order to have a more graceful failure mode 124 // in case a filesystem we haven't tested is encountered. Currently only 125 // ext2/3/4 are known to work properly. 126 const extMagic = 0xef53 127 switch stat.Type { 128 case extMagic: 129 return syncRangeSmokeTest(fd, unix.SyncFileRange) 130 } 131 return false 132 }