github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/fs/health/fshc.go (about) 1 // Package health provides a basic mountpath health monitor. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 * 5 */ 6 package health 7 8 import ( 9 "errors" 10 "fmt" 11 "io" 12 "os" 13 "path/filepath" 14 15 "github.com/NVIDIA/aistore/cmn" 16 "github.com/NVIDIA/aistore/cmn/cos" 17 "github.com/NVIDIA/aistore/cmn/nlog" 18 "github.com/NVIDIA/aistore/fs" 19 ) 20 21 const ( 22 fshcFileSize = 10 * cos.MiB // size of temporary file which will test writing and reading the mountpath 23 fshcMaxFileList = 100 // maximum number of files to read by Readdir 24 25 fshcTemp = "fshc" 26 ) 27 28 // When an IO error is triggered, it runs a few tests to make sure that the 29 // failed mountpath is healthy. Once the mountpath is considered faulty the 30 // mountpath is disabled and removed from the list. 31 // 32 // for mountpath definition, see fs/mountfs.go 33 type ( 34 fspathDispatcher interface { 35 DisableMpath(mpath, reason string) (err error) 36 } 37 FSHC struct { 38 dispatcher fspathDispatcher // listener is notified upon mountpath events (disabled, etc.) 39 fileListCh chan string 40 stopCh cos.StopCh 41 } 42 ) 43 44 ////////// 45 // FSHC // 46 ////////// 47 48 // interface guard 49 var _ cos.Runner = (*FSHC)(nil) 50 51 func NewFSHC(dispatcher fspathDispatcher) (f *FSHC) { 52 f = &FSHC{dispatcher: dispatcher, fileListCh: make(chan string, 100)} 53 f.stopCh.Init() 54 return 55 } 56 57 func (*FSHC) Name() string { return "fshc" } 58 59 func (f *FSHC) Run() error { 60 nlog.Infof("Starting %s", f.Name()) 61 62 for { 63 select { 64 case filePath := <-f.fileListCh: 65 mi, err := fs.Path2Mpath(filePath) 66 if err != nil { 67 nlog.Errorln(err) 68 break 69 } 70 71 f.runMpathTest(mi.Path, filePath) 72 case <-f.stopCh.Listen(): 73 return nil 74 } 75 } 76 } 77 78 func (f *FSHC) Stop(err error) { 79 nlog.Infof("Stopping %s, err: %v", f.Name(), err) 80 f.stopCh.Close() 81 } 82 83 func (f *FSHC) OnErr(fqn string) { 84 if !cmn.GCO.Get().FSHC.Enabled { 85 return 86 } 87 f.fileListCh <- fqn 88 } 89 90 func isTestPassed(mpath string, readErrors, writeErrors int, available bool) (passed bool, err error) { 91 config := &cmn.GCO.Get().FSHC 92 nlog.Infof("Tested mountpath %s(%v), read: %d of %d, write(size=%d): %d of %d", 93 mpath, available, 94 readErrors, config.ErrorLimit, fshcFileSize, 95 writeErrors, config.ErrorLimit) 96 97 if !available { 98 return false, errors.New("mountpath is unavailable") 99 } 100 101 passed = readErrors < config.ErrorLimit && writeErrors < config.ErrorLimit 102 if !passed { 103 err = fmt.Errorf("too many errors: %d read error%s, %d write error%s", 104 readErrors, cos.Plural(readErrors), writeErrors, cos.Plural(writeErrors)) 105 } 106 return passed, err 107 } 108 109 func (f *FSHC) runMpathTest(mpath, filepath string) { 110 var ( 111 config = cmn.GCO.Get() 112 whyFailed error 113 passed bool 114 ) 115 readErrs, writeErrs, exists := testMountpath(config, filepath, mpath, fshcFileSize) 116 if passed, whyFailed = isTestPassed(mpath, readErrs, writeErrs, exists); passed { 117 return 118 } 119 nlog.Errorf("Disabling mountpath %s...", mpath) 120 if err := f.dispatcher.DisableMpath(mpath, whyFailed.Error()); err != nil { 121 nlog.Errorf("Failed to disable mountpath: %s", err.Error()) 122 } 123 } 124 125 // reads the entire file content 126 func tryReadFile(fqn string) error { 127 file, err := fs.DirectOpen(fqn, os.O_RDONLY, 0) 128 if err != nil { 129 return err 130 } 131 if _, err := io.Copy(io.Discard, file); err != nil { 132 _ = file.Close() 133 return err 134 } 135 return file.Close() 136 } 137 138 // Creates a random file in a random directory inside a mountpath. 139 func tryWriteFile(mpath string, fileSize int64) error { 140 const ftag = "temp file" 141 // Do not test a mountpath if it is already disabled. To avoid a race 142 // when a lot of PUTs fail and each one calls FSHC, FSHC disables 143 // the mountpath on the first run, so all other tryWriteFile are redundant 144 available, disabled := fs.Get() 145 if _, ok := disabled[mpath]; ok { 146 return nil 147 } 148 mi, ok := available[mpath] 149 if !ok { 150 nlog.Warningf("Tried to write %s to non-existing mountpath %q", ftag, mpath) 151 return nil 152 } 153 154 tmpDir := mi.TempDir(fshcTemp) 155 if err := cos.CreateDir(tmpDir); err != nil { 156 return fmt.Errorf("failed to create directory %s: %w", tmpDir, err) 157 } 158 tmpFileName := filepath.Join(tmpDir, "fshc-try-write-"+cos.CryptoRandS(10)) 159 tmpFile, err := fs.DirectOpen(tmpFileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, cos.PermRWR) 160 if err != nil { 161 return fmt.Errorf("failed to create %s, err: %w", ftag, err) 162 } 163 164 defer func() { 165 if err := tmpFile.Close(); err != nil { 166 nlog.Errorf("Failed to close %s %q, err: %v", ftag, tmpFileName, err) 167 } 168 if err := cos.RemoveFile(tmpFileName); err != nil { 169 nlog.Errorf("Failed to remove %s %q, err: %v", ftag, tmpFileName, err) 170 } 171 }() 172 173 if err = cos.FloodWriter(tmpFile, fileSize); err != nil { 174 return fmt.Errorf("failed to write %s %q, err: %w", ftag, tmpFileName, err) 175 } 176 if err = tmpFile.Sync(); err != nil { 177 return fmt.Errorf("failed to sync %s %q, err: %w", ftag, tmpFileName, err) 178 } 179 return nil 180 } 181 182 // the core testing function: reads existing and writes temporary files on mountpath 183 // 1. If the filepath points to existing file, it reads this file 184 // 2. Reads up to maxReads files selected at random 185 // 3. Creates up to maxWrites temporary files 186 // 187 // The function returns the number of read/write errors, and if the mountpath 188 // 189 // is accessible. When the specified local directory is inaccessible the 190 // function returns immediately without any read/write operations 191 func testMountpath(config *cmn.Config, filePath, mountpath string, fileSize int) (readFails, writeFails int, accessible bool) { 192 if cmn.Rom.FastV(4, cos.SmoduleFS) { 193 nlog.Infof("Testing mountpath %q", mountpath) 194 } 195 if err := cos.Stat(mountpath); err != nil { 196 nlog.Errorf("Mountpath %q is unavailable", mountpath) 197 return 0, 0, false 198 } 199 200 totalReads, totalWrites := 0, 0 201 202 // 1. Read the file that causes the error, if it is defined. 203 if filePath != "" { 204 if stat, err := os.Stat(filePath); err == nil && !stat.IsDir() { 205 totalReads++ 206 207 if err := tryReadFile(filePath); err != nil { 208 nlog.Errorf("Failed to read file (fqn: %q, read_fails: %d, err: %v)", filePath, readFails, err) 209 if cos.IsIOError(err) { 210 readFails++ 211 } 212 } 213 } 214 } 215 216 // 2. Read a few more files up to maxReads files. 217 maxTestFiles := config.FSHC.TestFileCount 218 for totalReads < maxTestFiles { 219 fqn, err := getRandomFileName(mountpath) 220 if err == io.EOF { 221 // No files in the mountpath. 222 if cmn.Rom.FastV(4, cos.SmoduleFS) { 223 nlog.Infof("Mountpath %q contains no files", mountpath) 224 } 225 break 226 } 227 totalReads++ 228 if err != nil { 229 if cos.IsIOError(err) { 230 readFails++ 231 } 232 nlog.Errorf("Failed to select a random file (mountpath: %q, read_fails: %d, err: %v)", 233 mountpath, readFails, err, 234 ) 235 continue 236 } 237 if cmn.Rom.FastV(4, cos.SmoduleFS) { 238 nlog.Infof("Reading random file (fqn: %q)", fqn) 239 } 240 if err = tryReadFile(fqn); err != nil { 241 nlog.Errorf("Failed to read file (fqn: %q, err: %v)", fqn, err) 242 if cos.IsIOError(err) { 243 readFails++ 244 } 245 } 246 } 247 248 // 3. Try to create a few random files inside the mountpath. 249 for totalWrites < maxTestFiles { 250 totalWrites++ 251 if err := tryWriteFile(mountpath, int64(fileSize)); err != nil { 252 nlog.Errorf("Failed to write file (mountpath: %q, err: %v)", mountpath, err) 253 if cos.IsIOError(err) { 254 writeFails++ 255 } 256 } 257 } 258 259 if readFails != 0 || writeFails != 0 { 260 nlog.Errorf("Mountpath results (mountpath: %q, read_fails: %d, total_reads: %d, write_fails: %d, total_writes: %d)", 261 mountpath, readFails, totalReads, writeFails, totalWrites, 262 ) 263 } 264 265 return readFails, writeFails, true 266 } 267 268 // gets a base directory and looks for a random file inside it. 269 // Returns an error if any directory cannot be read 270 func getRandomFileName(basePath string) (string, error) { 271 file, err := os.Open(basePath) 272 if err != nil { 273 return "", err 274 } 275 276 files, err := file.ReadDir(fshcMaxFileList) 277 if err == nil { 278 fmap := make(map[string]os.DirEntry, len(files)) 279 for _, ff := range files { 280 fmap[ff.Name()] = ff 281 } 282 283 // look for a non-empty random entry 284 for k, info := range fmap { 285 // it is a file - return its fqn 286 if !info.IsDir() { 287 return filepath.Join(basePath, k), nil 288 } 289 // it is a directory - return a random file from it 290 chosen, err := getRandomFileName(filepath.Join(basePath, k)) 291 if err != nil { 292 return "", err 293 } 294 if chosen != "" { 295 return chosen, nil 296 } 297 } 298 } 299 return "", err 300 }