github.com/charlievieth/fastwalk@v1.0.3/fastwalk.go (about) 1 // Package fastwalk provides a faster version of filepath.Walk for file system 2 // scanning tools. 3 package fastwalk 4 5 /* 6 * This code borrows heavily from golang.org/x/tools/internal/fastwalk 7 * and as such the Go license can be found in the go.LICENSE file and 8 * is reproduced below: 9 * 10 * Copyright (c) 2009 The Go Authors. All rights reserved. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions are 14 * met: 15 * 16 * * Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * * Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following disclaimer 20 * in the documentation and/or other materials provided with the 21 * distribution. 22 * * Neither the name of Google Inc. nor the names of its 23 * contributors may be used to endorse or promote products derived from 24 * this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 import ( 40 "errors" 41 "io/fs" 42 "os" 43 "path/filepath" 44 "runtime" 45 "sync" 46 ) 47 48 // ErrTraverseLink is used as a return value from WalkFuncs to indicate that the 49 // symlink named in the call may be traversed. 50 var ErrTraverseLink = errors.New("fastwalk: traverse symlink, assuming target is a directory") 51 52 // ErrSkipFiles is a used as a return value from WalkFuncs to indicate that the 53 // callback should not be called for any other files in the current directory. 54 // Child directories will still be traversed. 55 var ErrSkipFiles = errors.New("fastwalk: skip remaining files in directory") 56 57 // SkipDir is used as a return value from WalkDirFuncs to indicate that 58 // the directory named in the call is to be skipped. It is not returned 59 // as an error by any function. 60 var SkipDir = fs.SkipDir 61 62 // DefaultNumWorkers returns the default number of worker goroutines to use in 63 // fastwalk.Walk and is the value of runtime.GOMAXPROCS(-1) clamped to a range 64 // of 4 to 32 except on Darwin where it is either 4 (8 cores or less) or 6 65 // (more than 8 cores). This is because Walk / IO performance on Darwin 66 // degrades with more concurrency. 67 // 68 // The optimal number for your workload may be lower or higher. The results 69 // of BenchmarkFastWalkNumWorkers benchmark may be informative. 70 func DefaultNumWorkers() int { 71 numCPU := runtime.GOMAXPROCS(-1) 72 if numCPU < 4 { 73 return 4 74 } 75 // Darwin IO performance on APFS slows with more workers. 76 // Stat performance is best around 2-4 and file IO is best 77 // around 4-6. More workers only benefit CPU intensive tasks. 78 if runtime.GOOS == "darwin" { 79 if numCPU <= 8 { 80 return 4 81 } 82 return 6 83 } 84 if numCPU > 32 { 85 return 32 86 } 87 return numCPU 88 } 89 90 // DefaultConfig is the default Config used when none is supplied. 91 var DefaultConfig = Config{ 92 Follow: false, 93 NumWorkers: DefaultNumWorkers(), 94 } 95 96 type Config struct { 97 // TODO: do we want to pass a sentinel error to WalkFunc if 98 // a symlink loop is detected? 99 100 // Follow symbolic links ignoring directories that would lead 101 // to infinite loops; that is, entering a previously visited 102 // directory that is an ancestor of the last file encountered. 103 // 104 // The sentinel error ErrTraverseLink is ignored when Follow 105 // is true (this to prevent users from defeating the loop 106 // detection logic), but SkipDir and ErrSkipFiles are still 107 // respected. 108 Follow bool 109 110 // Number of parallel workers to use. If NumWorkers if ≤ 0 then 111 // the greater of runtime.NumCPU() or 4 is used. 112 NumWorkers int 113 } 114 115 // A DirEntry extends the fs.DirEntry interface to add a Stat() method 116 // that returns the result of calling os.Stat() on the underlying file. 117 // The results of Info() and Stat() are cached. 118 // 119 // The fs.DirEntry argument passed to the fs.WalkDirFunc by Walk is 120 // always a DirEntry. The only exception is the root directory with 121 // with Walk is called. 122 type DirEntry interface { 123 fs.DirEntry 124 125 // Stat returns the FileInfo for the file or subdirectory described 126 // by the entry. The returned FileInfo may be from the time of the 127 // original directory read or from the time of the call to Stat. 128 // If the entry denotes a symbolic link, Stat reports the information 129 // about the target itself, not the link. 130 Stat() (fs.FileInfo, error) 131 } 132 133 // Walk is a faster implementation of filepath.Walk. 134 // 135 // filepath.Walk's design necessarily calls os.Lstat on each file, even if 136 // the caller needs less info. Many tools need only the type of each file. 137 // On some platforms, this information is provided directly by the readdir 138 // system call, avoiding the need to stat each file individually. 139 // fastwalk_unix.go contains a fork of the syscall routines. 140 // 141 // See golang.org/issue/16399 142 // 143 // Walk walks the file tree rooted at root, calling walkFn for each file or 144 // directory in the tree, including root. 145 // 146 // If walkFn returns filepath.SkipDir, the directory is skipped. 147 // 148 // Unlike filepath.WalkDir: 149 // - File stat calls must be done by the user and should be done via 150 // the DirEntry argument to walkFn since it caches the results of 151 // Stat and Lstat. 152 // - The fs.DirEntry argument is always a fastwalk.DirEntry, which has 153 // a Stat() method that returns the result of calling os.Stat() on the 154 // file. The result of Stat() may be cached. 155 // - Multiple goroutines stat the filesystem concurrently. The provided 156 // walkFn must be safe for concurrent use. 157 // - Walk can follow symlinks if walkFn returns the ErrTraverseLink 158 // sentinel error. It is the walkFn's responsibility to prevent 159 // Walk from going into symlink cycles. 160 func Walk(conf *Config, root string, walkFn fs.WalkDirFunc) error { 161 if conf == nil { 162 dupe := DefaultConfig 163 conf = &dupe 164 } 165 fi, err := os.Lstat(root) 166 if err != nil { 167 return err 168 } 169 170 // Make sure to wait for all workers to finish, otherwise 171 // walkFn could still be called after returning. This Wait call 172 // runs after close(e.donec) below. 173 var wg sync.WaitGroup 174 defer wg.Wait() 175 176 numWorkers := conf.NumWorkers 177 if numWorkers <= 0 { 178 numWorkers = DefaultNumWorkers() 179 } 180 181 w := &walker{ 182 fn: walkFn, 183 enqueuec: make(chan walkItem, numWorkers), // buffered for performance 184 workc: make(chan walkItem, numWorkers), // buffered for performance 185 donec: make(chan struct{}), 186 187 // buffered for correctness & not leaking goroutines: 188 resc: make(chan error, numWorkers), 189 190 follow: conf.Follow, 191 } 192 if w.follow { 193 if fi, err := os.Stat(root); err == nil { 194 w.ignoredDirs = append(w.ignoredDirs, fi) 195 } 196 } 197 198 defer close(w.donec) 199 200 for i := 0; i < numWorkers; i++ { 201 wg.Add(1) 202 go w.doWork(&wg) 203 } 204 205 root = cleanRootPath(root) 206 todo := []walkItem{{dir: root, info: fileInfoToDirEntry(filepath.Dir(root), fi)}} 207 out := 0 208 for { 209 workc := w.workc 210 var workItem walkItem 211 if len(todo) == 0 { 212 workc = nil 213 } else { 214 workItem = todo[len(todo)-1] 215 } 216 select { 217 case workc <- workItem: 218 todo = todo[:len(todo)-1] 219 out++ 220 case it := <-w.enqueuec: 221 todo = append(todo, it) 222 case err := <-w.resc: 223 out-- 224 if err != nil { 225 return err 226 } 227 if out == 0 && len(todo) == 0 { 228 // It's safe to quit here, as long as the buffered 229 // enqueue channel isn't also readable, which might 230 // happen if the worker sends both another unit of 231 // work and its result before the other select was 232 // scheduled and both w.resc and w.enqueuec were 233 // readable. 234 select { 235 case it := <-w.enqueuec: 236 todo = append(todo, it) 237 default: 238 return nil 239 } 240 } 241 } 242 } 243 } 244 245 // doWork reads directories as instructed (via workc) and runs the 246 // user's callback function. 247 func (w *walker) doWork(wg *sync.WaitGroup) { 248 defer wg.Done() 249 for { 250 select { 251 case <-w.donec: 252 return 253 case it := <-w.workc: 254 select { 255 case <-w.donec: 256 return 257 case w.resc <- w.walk(it.dir, it.info, !it.callbackDone): 258 } 259 } 260 } 261 } 262 263 type walker struct { 264 fn fs.WalkDirFunc 265 266 donec chan struct{} // closed on fastWalk's return 267 workc chan walkItem // to workers 268 enqueuec chan walkItem // from workers 269 resc chan error // from workers 270 271 ignoredDirs []os.FileInfo 272 follow bool 273 } 274 275 type walkItem struct { 276 dir string 277 info fs.DirEntry 278 callbackDone bool // callback already called; don't do it again 279 } 280 281 func (w *walker) enqueue(it walkItem) { 282 select { 283 case w.enqueuec <- it: 284 case <-w.donec: 285 } 286 } 287 288 func (w *walker) shouldSkipDir(fi os.FileInfo) bool { 289 for _, ignored := range w.ignoredDirs { 290 if os.SameFile(ignored, fi) { 291 return true 292 } 293 } 294 return false 295 } 296 297 func (w *walker) shouldTraverse(path string, de fs.DirEntry) bool { 298 // TODO: do we need to use filepath.EvalSymlinks() here? 299 ts, err := StatDirEntry(path, de) 300 if err != nil { 301 return false 302 } 303 if !ts.IsDir() { 304 return false 305 } 306 if w.shouldSkipDir(ts) { 307 return false 308 } 309 for { 310 parent := filepath.Dir(path) 311 if parent == path { 312 return true 313 } 314 parentInfo, err := os.Stat(parent) 315 if err != nil { 316 return false 317 } 318 if os.SameFile(ts, parentInfo) { 319 return false 320 } 321 path = parent 322 } 323 } 324 325 func joinPaths(dir, base string) string { 326 // Handle the case where the root path argument to Walk is "/" 327 // without this the returned path is prefixed with "//". 328 if os.PathSeparator == '/' && dir == "/" { 329 return dir + base 330 } 331 return dir + string(os.PathSeparator) + base 332 } 333 334 func (w *walker) onDirEnt(dirName, baseName string, de fs.DirEntry) error { 335 joined := joinPaths(dirName, baseName) 336 typ := de.Type() 337 if typ == os.ModeDir { 338 w.enqueue(walkItem{dir: joined, info: de}) 339 return nil 340 } 341 342 err := w.fn(joined, de, nil) 343 if typ == os.ModeSymlink { 344 if err == ErrTraverseLink { 345 if !w.follow { 346 // Set callbackDone so we don't call it twice for both the 347 // symlink-as-symlink and the symlink-as-directory later: 348 w.enqueue(walkItem{dir: joined, info: de, callbackDone: true}) 349 return nil 350 } 351 err = nil // Ignore ErrTraverseLink when Follow is true. 352 } 353 if err == filepath.SkipDir { 354 // Permit SkipDir on symlinks too. 355 return nil 356 } 357 if err == nil && w.follow && w.shouldTraverse(joined, de) { 358 // Traverse symlink 359 w.enqueue(walkItem{dir: joined, info: de, callbackDone: true}) 360 } 361 } 362 return err 363 } 364 365 func (w *walker) walk(root string, info fs.DirEntry, runUserCallback bool) error { 366 if runUserCallback { 367 err := w.fn(root, info, nil) 368 if err == filepath.SkipDir { 369 return nil 370 } 371 if err != nil { 372 return err 373 } 374 } 375 376 err := readDir(root, w.onDirEnt) 377 if err != nil { 378 // Second call, to report ReadDir error. 379 return w.fn(root, info, err) 380 } 381 return nil 382 } 383 384 func cleanRootPath(root string) string { 385 for i := len(root) - 1; i >= 0; i-- { 386 if !os.IsPathSeparator(root[i]) { 387 return root[:i+1] 388 } 389 } 390 if root != "" { 391 return root[0:1] // root is all path separators ("//") 392 } 393 return root 394 }