github.com/charlievieth/fastwalk@v1.0.3/fastwalk.go (about)

     1  // Package fastwalk provides a faster version of filepath.Walk for file system
     2  // scanning tools.
     3  package fastwalk
     4  
     5  /*
     6   * This code borrows heavily from golang.org/x/tools/internal/fastwalk
     7   * and as such the Go license can be found in the go.LICENSE file and
     8   * is reproduced below:
     9   *
    10   * Copyright (c) 2009 The Go Authors. All rights reserved.
    11   *
    12   * Redistribution and use in source and binary forms, with or without
    13   * modification, are permitted provided that the following conditions are
    14   * met:
    15   *
    16   *    * Redistributions of source code must retain the above copyright
    17   * notice, this list of conditions and the following disclaimer.
    18   *    * Redistributions in binary form must reproduce the above
    19   * copyright notice, this list of conditions and the following disclaimer
    20   * in the documentation and/or other materials provided with the
    21   * distribution.
    22   *    * Neither the name of Google Inc. nor the names of its
    23   * contributors may be used to endorse or promote products derived from
    24   * this software without specific prior written permission.
    25   *
    26   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    27   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    28   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    29   * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    30   * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    31   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    32   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    33   * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    34   * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    35   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    36   * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    37   */
    38  
    39  import (
    40  	"errors"
    41  	"io/fs"
    42  	"os"
    43  	"path/filepath"
    44  	"runtime"
    45  	"sync"
    46  )
    47  
    48  // ErrTraverseLink is used as a return value from WalkFuncs to indicate that the
    49  // symlink named in the call may be traversed.
    50  var ErrTraverseLink = errors.New("fastwalk: traverse symlink, assuming target is a directory")
    51  
    52  // ErrSkipFiles is a used as a return value from WalkFuncs to indicate that the
    53  // callback should not be called for any other files in the current directory.
    54  // Child directories will still be traversed.
    55  var ErrSkipFiles = errors.New("fastwalk: skip remaining files in directory")
    56  
    57  // SkipDir is used as a return value from WalkDirFuncs to indicate that
    58  // the directory named in the call is to be skipped. It is not returned
    59  // as an error by any function.
    60  var SkipDir = fs.SkipDir
    61  
    62  // DefaultNumWorkers returns the default number of worker goroutines to use in
    63  // fastwalk.Walk and is the value of runtime.GOMAXPROCS(-1) clamped to a range
    64  // of 4 to 32 except on Darwin where it is either 4 (8 cores or less) or 6
    65  // (more than 8 cores). This is because Walk / IO performance on Darwin
    66  // degrades with more concurrency.
    67  //
    68  // The optimal number for your workload may be lower or higher. The results
    69  // of BenchmarkFastWalkNumWorkers benchmark may be informative.
    70  func DefaultNumWorkers() int {
    71  	numCPU := runtime.GOMAXPROCS(-1)
    72  	if numCPU < 4 {
    73  		return 4
    74  	}
    75  	// Darwin IO performance on APFS slows with more workers.
    76  	// Stat performance is best around 2-4 and file IO is best
    77  	// around 4-6. More workers only benefit CPU intensive tasks.
    78  	if runtime.GOOS == "darwin" {
    79  		if numCPU <= 8 {
    80  			return 4
    81  		}
    82  		return 6
    83  	}
    84  	if numCPU > 32 {
    85  		return 32
    86  	}
    87  	return numCPU
    88  }
    89  
    90  // DefaultConfig is the default Config used when none is supplied.
    91  var DefaultConfig = Config{
    92  	Follow:     false,
    93  	NumWorkers: DefaultNumWorkers(),
    94  }
    95  
        // Config holds the options accepted by Walk. When a nil *Config is passed
        // to Walk, a copy of DefaultConfig is used instead.
    96  type Config struct {
    97  	// TODO: do we want to pass a sentinel error to WalkFunc if
    98  	// a symlink loop is detected?
    99  
   100  	// Follow symbolic links ignoring directories that would lead
   101  	// to infinite loops; that is, entering a previously visited
   102  	// directory that is an ancestor of the last file encountered.
   103  	//
   104  	// The sentinel error ErrTraverseLink is ignored when Follow
   105  	// is true (this is to prevent users from defeating the loop
   106  	// detection logic), but SkipDir and ErrSkipFiles are still
   107  	// respected.
   108  	Follow bool
   109  
   110  	// Number of parallel workers to use. If NumWorkers is ≤ 0 then
   111  	// the value returned by DefaultNumWorkers() is used.
   112  	NumWorkers int
   113  }
   114  
   115  // A DirEntry extends the fs.DirEntry interface to add a Stat() method
   116  // that returns the result of calling os.Stat() on the underlying file.
   117  // The results of Info() and Stat() are cached.
   118  //
   119  // The fs.DirEntry argument passed to the fs.WalkDirFunc by Walk is
   120  // always a DirEntry. The only exception is the root directory with
   121  // which Walk is called.
   122  type DirEntry interface {
   123  	fs.DirEntry
   124  
   125  	// Stat returns the FileInfo for the file or subdirectory described
   126  	// by the entry. The returned FileInfo may be from the time of the
   127  	// original directory read or from the time of the call to Stat.
   128  	// If the entry denotes a symbolic link, Stat reports the information
   129  	// about the target itself, not the link.
   130  	Stat() (fs.FileInfo, error)
   131  }
   132  
   133  // Walk is a faster implementation of filepath.Walk.
   134  //
   135  // filepath.Walk's design necessarily calls os.Lstat on each file, even if
   136  // the caller needs less info. Many tools need only the type of each file.
   137  // On some platforms, this information is provided directly by the readdir
   138  // system call, avoiding the need to stat each file individually.
   139  // fastwalk_unix.go contains a fork of the syscall routines.
   140  //
   141  // See golang.org/issue/16399
   142  //
   143  // Walk walks the file tree rooted at root, calling walkFn for each file or
   144  // directory in the tree, including root.
   145  //
   146  // If walkFn returns filepath.SkipDir, the directory is skipped.
   147  //
   148  // Unlike filepath.WalkDir:
   149  //   - File stat calls must be done by the user and should be done via
   150  //     the DirEntry argument to walkFn since it caches the results of
   151  //     Stat and Lstat.
   152  //   - The fs.DirEntry argument is always a fastwalk.DirEntry, which has
   153  //     a Stat() method that returns the result of calling os.Stat() on the
   154  //     file. The result of Stat() may be cached.
   155  //   - Multiple goroutines stat the filesystem concurrently. The provided
   156  //     walkFn must be safe for concurrent use.
   157  //   - Walk can follow symlinks if walkFn returns the ErrTraverseLink
   158  //     sentinel error. It is the walkFn's responsibility to prevent
   159  //     Walk from going into symlink cycles.
   160  func Walk(conf *Config, root string, walkFn fs.WalkDirFunc) error {
   161  	if conf == nil {
   162  		dupe := DefaultConfig
   163  		conf = &dupe
   164  	}
   165  	fi, err := os.Lstat(root)
   166  	if err != nil {
   167  		return err
   168  	}
   169  
   170  	// Make sure to wait for all workers to finish, otherwise
   171  	// walkFn could still be called after returning. This Wait call
   172  	// runs after close(e.donec) below.
   173  	var wg sync.WaitGroup
   174  	defer wg.Wait()
   175  
   176  	numWorkers := conf.NumWorkers
   177  	if numWorkers <= 0 {
   178  		numWorkers = DefaultNumWorkers()
   179  	}
   180  
   181  	w := &walker{
   182  		fn:       walkFn,
   183  		enqueuec: make(chan walkItem, numWorkers), // buffered for performance
   184  		workc:    make(chan walkItem, numWorkers), // buffered for performance
   185  		donec:    make(chan struct{}),
   186  
   187  		// buffered for correctness & not leaking goroutines:
   188  		resc: make(chan error, numWorkers),
   189  
   190  		follow: conf.Follow,
   191  	}
   192  	if w.follow {
   193  		if fi, err := os.Stat(root); err == nil {
   194  			w.ignoredDirs = append(w.ignoredDirs, fi)
   195  		}
   196  	}
   197  
   198  	defer close(w.donec)
   199  
   200  	for i := 0; i < numWorkers; i++ {
   201  		wg.Add(1)
   202  		go w.doWork(&wg)
   203  	}
   204  
   205  	root = cleanRootPath(root)
   206  	todo := []walkItem{{dir: root, info: fileInfoToDirEntry(filepath.Dir(root), fi)}}
   207  	out := 0
   208  	for {
   209  		workc := w.workc
   210  		var workItem walkItem
   211  		if len(todo) == 0 {
   212  			workc = nil
   213  		} else {
   214  			workItem = todo[len(todo)-1]
   215  		}
   216  		select {
   217  		case workc <- workItem:
   218  			todo = todo[:len(todo)-1]
   219  			out++
   220  		case it := <-w.enqueuec:
   221  			todo = append(todo, it)
   222  		case err := <-w.resc:
   223  			out--
   224  			if err != nil {
   225  				return err
   226  			}
   227  			if out == 0 && len(todo) == 0 {
   228  				// It's safe to quit here, as long as the buffered
   229  				// enqueue channel isn't also readable, which might
   230  				// happen if the worker sends both another unit of
   231  				// work and its result before the other select was
   232  				// scheduled and both w.resc and w.enqueuec were
   233  				// readable.
   234  				select {
   235  				case it := <-w.enqueuec:
   236  					todo = append(todo, it)
   237  				default:
   238  					return nil
   239  				}
   240  			}
   241  		}
   242  	}
   243  }
   244  
   245  // doWork reads directories as instructed (via workc) and runs the
   246  // user's callback function.
   247  func (w *walker) doWork(wg *sync.WaitGroup) {
   248  	defer wg.Done()
   249  	for {
   250  		select {
   251  		case <-w.donec:
   252  			return
   253  		case it := <-w.workc:
   254  			select {
   255  			case <-w.donec:
   256  				return
   257  			case w.resc <- w.walk(it.dir, it.info, !it.callbackDone):
   258  			}
   259  		}
   260  	}
   261  }
   262  
        // walker holds the state shared by Walk's scheduling loop and the
        // worker goroutines it starts.
   263  type walker struct {
        	// fn is the caller's callback (the walkFn argument of Walk).
   264  	fn fs.WalkDirFunc
   265  
   266  	donec    chan struct{} // closed on Walk's return
   267  	workc    chan walkItem // to workers
   268  	enqueuec chan walkItem // from workers
   269  	resc     chan error    // from workers
   270  
        	// ignoredDirs lists the directories rejected by shouldSkipDir; when
        	// follow is set, Walk seeds it with the os.Stat result for the root.
        	// follow is a copy of Config.Follow.
   271  	ignoredDirs []os.FileInfo
   272  	follow      bool
   273  }
   274  
        // walkItem is one unit of work passed between Walk and its workers over
        // the workc and enqueuec channels.
   275  type walkItem struct {
        	// dir is the path passed to walker.walk, and info is the entry that
        	// accompanies it in calls to the callback.
   276  	dir          string
   277  	info         fs.DirEntry
   278  	callbackDone bool // callback already called; don't do it again
   279  }
   280  
   281  func (w *walker) enqueue(it walkItem) {
   282  	select {
   283  	case w.enqueuec <- it:
   284  	case <-w.donec:
   285  	}
   286  }
   287  
   288  func (w *walker) shouldSkipDir(fi os.FileInfo) bool {
   289  	for _, ignored := range w.ignoredDirs {
   290  		if os.SameFile(ignored, fi) {
   291  			return true
   292  		}
   293  	}
   294  	return false
   295  }
   296  
   297  func (w *walker) shouldTraverse(path string, de fs.DirEntry) bool {
   298  	// TODO: do we need to use filepath.EvalSymlinks() here?
   299  	ts, err := StatDirEntry(path, de)
   300  	if err != nil {
   301  		return false
   302  	}
   303  	if !ts.IsDir() {
   304  		return false
   305  	}
   306  	if w.shouldSkipDir(ts) {
   307  		return false
   308  	}
   309  	for {
   310  		parent := filepath.Dir(path)
   311  		if parent == path {
   312  			return true
   313  		}
   314  		parentInfo, err := os.Stat(parent)
   315  		if err != nil {
   316  			return false
   317  		}
   318  		if os.SameFile(ts, parentInfo) {
   319  			return false
   320  		}
   321  		path = parent
   322  	}
   323  }
   324  
   325  func joinPaths(dir, base string) string {
   326  	// Handle the case where the root path argument to Walk is "/"
   327  	// without this the returned path is prefixed with "//".
   328  	if os.PathSeparator == '/' && dir == "/" {
   329  		return dir + base
   330  	}
   331  	return dir + string(os.PathSeparator) + base
   332  }
   333  
   334  func (w *walker) onDirEnt(dirName, baseName string, de fs.DirEntry) error {
   335  	joined := joinPaths(dirName, baseName)
   336  	typ := de.Type()
   337  	if typ == os.ModeDir {
   338  		w.enqueue(walkItem{dir: joined, info: de})
   339  		return nil
   340  	}
   341  
   342  	err := w.fn(joined, de, nil)
   343  	if typ == os.ModeSymlink {
   344  		if err == ErrTraverseLink {
   345  			if !w.follow {
   346  				// Set callbackDone so we don't call it twice for both the
   347  				// symlink-as-symlink and the symlink-as-directory later:
   348  				w.enqueue(walkItem{dir: joined, info: de, callbackDone: true})
   349  				return nil
   350  			}
   351  			err = nil // Ignore ErrTraverseLink when Follow is true.
   352  		}
   353  		if err == filepath.SkipDir {
   354  			// Permit SkipDir on symlinks too.
   355  			return nil
   356  		}
   357  		if err == nil && w.follow && w.shouldTraverse(joined, de) {
   358  			// Traverse symlink
   359  			w.enqueue(walkItem{dir: joined, info: de, callbackDone: true})
   360  		}
   361  	}
   362  	return err
   363  }
   364  
   365  func (w *walker) walk(root string, info fs.DirEntry, runUserCallback bool) error {
   366  	if runUserCallback {
   367  		err := w.fn(root, info, nil)
   368  		if err == filepath.SkipDir {
   369  			return nil
   370  		}
   371  		if err != nil {
   372  			return err
   373  		}
   374  	}
   375  
   376  	err := readDir(root, w.onDirEnt)
   377  	if err != nil {
   378  		// Second call, to report ReadDir error.
   379  		return w.fn(root, info, err)
   380  	}
   381  	return nil
   382  }
   383  
   384  func cleanRootPath(root string) string {
   385  	for i := len(root) - 1; i >= 0; i-- {
   386  		if !os.IsPathSeparator(root[i]) {
   387  			return root[:i+1]
   388  		}
   389  	}
   390  	if root != "" {
   391  		return root[0:1] // root is all path separators ("//")
   392  	}
   393  	return root
   394  }