github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/fs/health/fshc.go (about)

     1  // Package health provides a basic mountpath health monitor.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   *
     5   */
     6  package health
     7  
     8  import (
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"os"
    13  	"path/filepath"
    14  
    15  	"github.com/NVIDIA/aistore/cmn"
    16  	"github.com/NVIDIA/aistore/cmn/cos"
    17  	"github.com/NVIDIA/aistore/cmn/nlog"
    18  	"github.com/NVIDIA/aistore/fs"
    19  )
    20  
    21  const (
    22  	fshcFileSize    = 10 * cos.MiB // size of temporary file which will test writing and reading the mountpath
    23  	fshcMaxFileList = 100          // maximum number of files to read by Readdir
    24  
    25  	fshcTemp = "fshc"
    26  )
    27  
    28  // When an IO error is triggered, it runs a few tests to make sure that the
    29  // failed mountpath is healthy. Once the mountpath is considered faulty the
    30  // mountpath is disabled and removed from the list.
    31  //
    32  // for mountpath definition, see fs/mountfs.go
    33  type (
    34  	fspathDispatcher interface {
    35  		DisableMpath(mpath, reason string) (err error)
    36  	}
    37  	FSHC struct {
    38  		dispatcher fspathDispatcher // listener is notified upon mountpath events (disabled, etc.)
    39  		fileListCh chan string
    40  		stopCh     cos.StopCh
    41  	}
    42  )
    43  
    44  //////////
    45  // FSHC //
    46  //////////
    47  
    48  // interface guard
    49  var _ cos.Runner = (*FSHC)(nil)
    50  
    51  func NewFSHC(dispatcher fspathDispatcher) (f *FSHC) {
    52  	f = &FSHC{dispatcher: dispatcher, fileListCh: make(chan string, 100)}
    53  	f.stopCh.Init()
    54  	return
    55  }
    56  
    57  func (*FSHC) Name() string { return "fshc" }
    58  
    59  func (f *FSHC) Run() error {
    60  	nlog.Infof("Starting %s", f.Name())
    61  
    62  	for {
    63  		select {
    64  		case filePath := <-f.fileListCh:
    65  			mi, err := fs.Path2Mpath(filePath)
    66  			if err != nil {
    67  				nlog.Errorln(err)
    68  				break
    69  			}
    70  
    71  			f.runMpathTest(mi.Path, filePath)
    72  		case <-f.stopCh.Listen():
    73  			return nil
    74  		}
    75  	}
    76  }
    77  
    78  func (f *FSHC) Stop(err error) {
    79  	nlog.Infof("Stopping %s, err: %v", f.Name(), err)
    80  	f.stopCh.Close()
    81  }
    82  
    83  func (f *FSHC) OnErr(fqn string) {
    84  	if !cmn.GCO.Get().FSHC.Enabled {
    85  		return
    86  	}
    87  	f.fileListCh <- fqn
    88  }
    89  
    90  func isTestPassed(mpath string, readErrors, writeErrors int, available bool) (passed bool, err error) {
    91  	config := &cmn.GCO.Get().FSHC
    92  	nlog.Infof("Tested mountpath %s(%v), read: %d of %d, write(size=%d): %d of %d",
    93  		mpath, available,
    94  		readErrors, config.ErrorLimit, fshcFileSize,
    95  		writeErrors, config.ErrorLimit)
    96  
    97  	if !available {
    98  		return false, errors.New("mountpath is unavailable")
    99  	}
   100  
   101  	passed = readErrors < config.ErrorLimit && writeErrors < config.ErrorLimit
   102  	if !passed {
   103  		err = fmt.Errorf("too many errors: %d read error%s, %d write error%s",
   104  			readErrors, cos.Plural(readErrors), writeErrors, cos.Plural(writeErrors))
   105  	}
   106  	return passed, err
   107  }
   108  
   109  func (f *FSHC) runMpathTest(mpath, filepath string) {
   110  	var (
   111  		config    = cmn.GCO.Get()
   112  		whyFailed error
   113  		passed    bool
   114  	)
   115  	readErrs, writeErrs, exists := testMountpath(config, filepath, mpath, fshcFileSize)
   116  	if passed, whyFailed = isTestPassed(mpath, readErrs, writeErrs, exists); passed {
   117  		return
   118  	}
   119  	nlog.Errorf("Disabling mountpath %s...", mpath)
   120  	if err := f.dispatcher.DisableMpath(mpath, whyFailed.Error()); err != nil {
   121  		nlog.Errorf("Failed to disable mountpath: %s", err.Error())
   122  	}
   123  }
   124  
   125  // reads the entire file content
   126  func tryReadFile(fqn string) error {
   127  	file, err := fs.DirectOpen(fqn, os.O_RDONLY, 0)
   128  	if err != nil {
   129  		return err
   130  	}
   131  	if _, err := io.Copy(io.Discard, file); err != nil {
   132  		_ = file.Close()
   133  		return err
   134  	}
   135  	return file.Close()
   136  }
   137  
   138  // Creates a random file in a random directory inside a mountpath.
   139  func tryWriteFile(mpath string, fileSize int64) error {
   140  	const ftag = "temp file"
   141  	// Do not test a mountpath if it is already disabled. To avoid a race
   142  	// when a lot of PUTs fail and each one calls FSHC, FSHC disables
   143  	// the mountpath on the first run, so all other tryWriteFile are redundant
   144  	available, disabled := fs.Get()
   145  	if _, ok := disabled[mpath]; ok {
   146  		return nil
   147  	}
   148  	mi, ok := available[mpath]
   149  	if !ok {
   150  		nlog.Warningf("Tried to write %s to non-existing mountpath %q", ftag, mpath)
   151  		return nil
   152  	}
   153  
   154  	tmpDir := mi.TempDir(fshcTemp)
   155  	if err := cos.CreateDir(tmpDir); err != nil {
   156  		return fmt.Errorf("failed to create directory %s: %w", tmpDir, err)
   157  	}
   158  	tmpFileName := filepath.Join(tmpDir, "fshc-try-write-"+cos.CryptoRandS(10))
   159  	tmpFile, err := fs.DirectOpen(tmpFileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, cos.PermRWR)
   160  	if err != nil {
   161  		return fmt.Errorf("failed to create %s, err: %w", ftag, err)
   162  	}
   163  
   164  	defer func() {
   165  		if err := tmpFile.Close(); err != nil {
   166  			nlog.Errorf("Failed to close %s %q, err: %v", ftag, tmpFileName, err)
   167  		}
   168  		if err := cos.RemoveFile(tmpFileName); err != nil {
   169  			nlog.Errorf("Failed to remove %s %q, err: %v", ftag, tmpFileName, err)
   170  		}
   171  	}()
   172  
   173  	if err = cos.FloodWriter(tmpFile, fileSize); err != nil {
   174  		return fmt.Errorf("failed to write %s %q, err: %w", ftag, tmpFileName, err)
   175  	}
   176  	if err = tmpFile.Sync(); err != nil {
   177  		return fmt.Errorf("failed to sync %s %q, err: %w", ftag, tmpFileName, err)
   178  	}
   179  	return nil
   180  }
   181  
   182  // the core testing function: reads existing and writes temporary files on mountpath
   183  //  1. If the filepath points to existing file, it reads this file
   184  //  2. Reads up to maxReads files selected at random
   185  //  3. Creates up to maxWrites temporary files
   186  //
   187  // The function returns the number of read/write errors, and if the mountpath
   188  //
   189  //	is accessible. When the specified local directory is inaccessible the
   190  //	function returns immediately without any read/write operations
   191  func testMountpath(config *cmn.Config, filePath, mountpath string, fileSize int) (readFails, writeFails int, accessible bool) {
   192  	if cmn.Rom.FastV(4, cos.SmoduleFS) {
   193  		nlog.Infof("Testing mountpath %q", mountpath)
   194  	}
   195  	if err := cos.Stat(mountpath); err != nil {
   196  		nlog.Errorf("Mountpath %q is unavailable", mountpath)
   197  		return 0, 0, false
   198  	}
   199  
   200  	totalReads, totalWrites := 0, 0
   201  
   202  	// 1. Read the file that causes the error, if it is defined.
   203  	if filePath != "" {
   204  		if stat, err := os.Stat(filePath); err == nil && !stat.IsDir() {
   205  			totalReads++
   206  
   207  			if err := tryReadFile(filePath); err != nil {
   208  				nlog.Errorf("Failed to read file (fqn: %q, read_fails: %d, err: %v)", filePath, readFails, err)
   209  				if cos.IsIOError(err) {
   210  					readFails++
   211  				}
   212  			}
   213  		}
   214  	}
   215  
   216  	// 2. Read a few more files up to maxReads files.
   217  	maxTestFiles := config.FSHC.TestFileCount
   218  	for totalReads < maxTestFiles {
   219  		fqn, err := getRandomFileName(mountpath)
   220  		if err == io.EOF {
   221  			// No files in the mountpath.
   222  			if cmn.Rom.FastV(4, cos.SmoduleFS) {
   223  				nlog.Infof("Mountpath %q contains no files", mountpath)
   224  			}
   225  			break
   226  		}
   227  		totalReads++
   228  		if err != nil {
   229  			if cos.IsIOError(err) {
   230  				readFails++
   231  			}
   232  			nlog.Errorf("Failed to select a random file (mountpath: %q, read_fails: %d, err: %v)",
   233  				mountpath, readFails, err,
   234  			)
   235  			continue
   236  		}
   237  		if cmn.Rom.FastV(4, cos.SmoduleFS) {
   238  			nlog.Infof("Reading random file (fqn: %q)", fqn)
   239  		}
   240  		if err = tryReadFile(fqn); err != nil {
   241  			nlog.Errorf("Failed to read file (fqn: %q, err: %v)", fqn, err)
   242  			if cos.IsIOError(err) {
   243  				readFails++
   244  			}
   245  		}
   246  	}
   247  
   248  	// 3. Try to create a few random files inside the mountpath.
   249  	for totalWrites < maxTestFiles {
   250  		totalWrites++
   251  		if err := tryWriteFile(mountpath, int64(fileSize)); err != nil {
   252  			nlog.Errorf("Failed to write file (mountpath: %q, err: %v)", mountpath, err)
   253  			if cos.IsIOError(err) {
   254  				writeFails++
   255  			}
   256  		}
   257  	}
   258  
   259  	if readFails != 0 || writeFails != 0 {
   260  		nlog.Errorf("Mountpath results (mountpath: %q, read_fails: %d, total_reads: %d, write_fails: %d, total_writes: %d)",
   261  			mountpath, readFails, totalReads, writeFails, totalWrites,
   262  		)
   263  	}
   264  
   265  	return readFails, writeFails, true
   266  }
   267  
   268  // gets a base directory and looks for a random file inside it.
   269  // Returns an error if any directory cannot be read
   270  func getRandomFileName(basePath string) (string, error) {
   271  	file, err := os.Open(basePath)
   272  	if err != nil {
   273  		return "", err
   274  	}
   275  
   276  	files, err := file.ReadDir(fshcMaxFileList)
   277  	if err == nil {
   278  		fmap := make(map[string]os.DirEntry, len(files))
   279  		for _, ff := range files {
   280  			fmap[ff.Name()] = ff
   281  		}
   282  
   283  		// look for a non-empty random entry
   284  		for k, info := range fmap {
   285  			// it is a file - return its fqn
   286  			if !info.IsDir() {
   287  				return filepath.Join(basePath, k), nil
   288  			}
   289  			// it is a directory - return a random file from it
   290  			chosen, err := getRandomFileName(filepath.Join(basePath, k))
   291  			if err != nil {
   292  				return "", err
   293  			}
   294  			if chosen != "" {
   295  				return chosen, nil
   296  			}
   297  		}
   298  	}
   299  	return "", err
   300  }