github.com/boyter/gocodewalker@v1.3.2/file.go (about)

     1  // Package gocodewalker provides file operations specific to code repositories,
     2  // such as walking the file tree while obeying .ignore and .gitignore files,
     3  // or looking for the root directory when already inside a git or mercurial project.
     4  
     5  // SPDX-License-Identifier: MIT OR Unlicense
     6  
     7  package gocodewalker
     8  
     9  import (
    10  	"bytes"
    11  	"errors"
    12  	"github.com/boyter/gocodewalker/go-gitignore"
    13  	"golang.org/x/sync/errgroup"
    14  	"os"
    15  	"path"
    16  	"path/filepath"
    17  	"regexp"
    18  	"strings"
    19  	"sync"
    20  )
    21  
    22  const (
    23  	GitIgnore = ".gitignore"
    24  	Ignore    = ".ignore"
    25  )
    26  
    27  // ErrTerminateWalk error which indicates that the walker was terminated
    28  var ErrTerminateWalk = errors.New("gocodewalker terminated")
    29  
    30  // File is a struct returned which contains the location and the filename of the file that passed all exclusion rules
    31  type File struct {
    32  	Location string
    33  	Filename string
    34  }
    35  
    36  type FileWalker struct {
    37  	fileListQueue          chan *File
    38  	errorsHandler          func(error) bool // If returns true will continue to process where possible, otherwise returns if possible
    39  	directory              string
    40  	directories            []string
    41  	LocationExcludePattern []string // Case-sensitive patterns which exclude directory/file matches
    42  	IncludeDirectory       []string
    43  	ExcludeDirectory       []string // Paths to always ignore such as .git,.svn and .hg
    44  	IncludeFilename        []string
    45  	ExcludeFilename        []string
    46  	IncludeDirectoryRegex  []*regexp.Regexp // Must match regex as logical OR IE can match any of them
    47  	ExcludeDirectoryRegex  []*regexp.Regexp
    48  	IncludeFilenameRegex   []*regexp.Regexp
    49  	ExcludeFilenameRegex   []*regexp.Regexp
    50  	AllowListExtensions    []string // Which extensions should be allowed case sensitive
    51  	ExcludeListExtensions  []string // Which extensions should be excluded case sensitive
    52  	walkMutex              sync.Mutex
    53  	terminateWalking       bool
    54  	isWalking              bool
    55  	IgnoreIgnoreFile       bool // Should .ignore files be respected?
    56  	IgnoreGitIgnore        bool // Should .gitignore files be respected?
    57  	IncludeHidden          bool // Should hidden files and directories be included/walked
    58  	osOpen                 func(name string) (*os.File, error)
    59  	osReadFile             func(name string) ([]byte, error)
    60  }
    61  
    62  // NewFileWalker constructs a filewalker, which will walk the supplied directory
    63  // and output File results to the supplied queue as it finds them
    64  func NewFileWalker(directory string, fileListQueue chan *File) *FileWalker {
    65  	return &FileWalker{
    66  		fileListQueue:          fileListQueue,
    67  		errorsHandler:          func(e error) bool { return true }, // a generic one that just swallows everything
    68  		directory:              directory,
    69  		LocationExcludePattern: nil,
    70  		IncludeDirectory:       nil,
    71  		ExcludeDirectory:       nil,
    72  		IncludeFilename:        nil,
    73  		ExcludeFilename:        nil,
    74  		IncludeDirectoryRegex:  nil,
    75  		ExcludeDirectoryRegex:  nil,
    76  		IncludeFilenameRegex:   nil,
    77  		ExcludeFilenameRegex:   nil,
    78  		AllowListExtensions:    nil,
    79  		ExcludeListExtensions:  nil,
    80  		walkMutex:              sync.Mutex{},
    81  		terminateWalking:       false,
    82  		isWalking:              false,
    83  		IgnoreIgnoreFile:       false,
    84  		IgnoreGitIgnore:        false,
    85  		IncludeHidden:          false,
    86  		osOpen:                 os.Open,
    87  		osReadFile:             os.ReadFile,
    88  	}
    89  }
    90  
    91  // NewParallelFileWalker constructs a filewalker, which will walk the supplied directories in parallel
    92  // and output File results to the supplied queue as it finds them
    93  func NewParallelFileWalker(directories []string, fileListQueue chan *File) *FileWalker {
    94  	return &FileWalker{
    95  		fileListQueue:          fileListQueue,
    96  		errorsHandler:          func(e error) bool { return true }, // a generic one that just swallows everything
    97  		directories:            directories,
    98  		LocationExcludePattern: nil,
    99  		IncludeDirectory:       nil,
   100  		ExcludeDirectory:       nil,
   101  		IncludeFilename:        nil,
   102  		ExcludeFilename:        nil,
   103  		IncludeDirectoryRegex:  nil,
   104  		ExcludeDirectoryRegex:  nil,
   105  		IncludeFilenameRegex:   nil,
   106  		ExcludeFilenameRegex:   nil,
   107  		AllowListExtensions:    nil,
   108  		ExcludeListExtensions:  nil,
   109  		walkMutex:              sync.Mutex{},
   110  		terminateWalking:       false,
   111  		isWalking:              false,
   112  		IgnoreIgnoreFile:       false,
   113  		IgnoreGitIgnore:        false,
   114  		IncludeHidden:          false,
   115  		osOpen:                 os.Open,
   116  		osReadFile:             os.ReadFile,
   117  	}
   118  }
   119  
   120  // Walking gets the state of the file walker and determine
   121  // if we are walking or not
   122  func (f *FileWalker) Walking() bool {
   123  	f.walkMutex.Lock()
   124  	defer f.walkMutex.Unlock()
   125  	return f.isWalking
   126  }
   127  
   128  // Terminate have the walker break out of walking and return as
   129  // soon as it possibly can. This is needed because
   130  // this walker needs to work in a TUI interactive mode and
   131  // as such we need to be able to end old processes
   132  func (f *FileWalker) Terminate() {
   133  	f.walkMutex.Lock()
   134  	defer f.walkMutex.Unlock()
   135  	f.terminateWalking = true
   136  }
   137  
   138  // SetErrorHandler sets the function that is called on processing any error
   139  // where if you return true it will attempt to continue processing, and if false
   140  // will return the error instantly
   141  func (f *FileWalker) SetErrorHandler(errors func(error) bool) {
   142  	if errors != nil {
   143  		f.errorsHandler = errors
   144  	}
   145  }
   146  
   147  // Start will start walking the supplied directory with the supplied settings
   148  // and putting files that mach into the supplied channel.
   149  // Returns usual ioutil errors if there is a file issue
   150  // and a ErrTerminateWalk if terminate is called while walking
   151  func (f *FileWalker) Start() error {
   152  	f.walkMutex.Lock()
   153  	f.isWalking = true
   154  	f.walkMutex.Unlock()
   155  
   156  	var err error
   157  	if len(f.directories) != 0 {
   158  		eg := errgroup.Group{}
   159  		for _, directory := range f.directories {
   160  			d := directory // capture var
   161  			eg.Go(func() error {
   162  				return f.walkDirectoryRecursive(d, []gitignore.GitIgnore{}, []gitignore.GitIgnore{})
   163  			})
   164  		}
   165  
   166  		err = eg.Wait()
   167  	} else {
   168  		if f.directory != "" {
   169  			err = f.walkDirectoryRecursive(f.directory, []gitignore.GitIgnore{}, []gitignore.GitIgnore{})
   170  		}
   171  	}
   172  
   173  	close(f.fileListQueue)
   174  
   175  	f.walkMutex.Lock()
   176  	f.isWalking = false
   177  	f.walkMutex.Unlock()
   178  
   179  	return err
   180  }
   181  
   182  func (f *FileWalker) walkDirectoryRecursive(directory string, gitignores []gitignore.GitIgnore, ignores []gitignore.GitIgnore) error {
   183  	// NB have to call unlock not using defer because method is recursive
   184  	// and will deadlock if not done manually
   185  	f.walkMutex.Lock()
   186  	if f.terminateWalking {
   187  		f.walkMutex.Unlock()
   188  		return ErrTerminateWalk
   189  	}
   190  	f.walkMutex.Unlock()
   191  
   192  	d, err := f.osOpen(directory)
   193  	if err != nil {
   194  		// nothing we can do with this so return nil and process as best we can
   195  		if f.errorsHandler(err) {
   196  			return nil
   197  		}
   198  		return err
   199  	}
   200  	defer d.Close()
   201  
   202  	foundFiles, err := d.Readdir(-1)
   203  	if err != nil {
   204  		// nothing we can do with this so return nil and process as best we can
   205  		if f.errorsHandler(err) {
   206  			return nil
   207  		}
   208  		return err
   209  	}
   210  
   211  	files := []os.FileInfo{}
   212  	dirs := []os.FileInfo{}
   213  
   214  	// We want to break apart the files and directories from the
   215  	// return as we loop over them differently and this avoids some
   216  	// nested if logic at the expense of a "redundant" loop
   217  	for _, file := range foundFiles {
   218  		if file.IsDir() {
   219  			dirs = append(dirs, file)
   220  		} else {
   221  			files = append(files, file)
   222  		}
   223  	}
   224  
   225  	// Pull out all ignore and gitignore files and add them
   226  	// to out collection of gitignores to be applied for this pass
   227  	// and any subdirectories
   228  	// Since they can apply to the current list of files we need to ensure
   229  	// we do this before processing files themselves
   230  	for _, file := range files {
   231  		if !f.IgnoreGitIgnore {
   232  			if file.Name() == GitIgnore {
   233  				c, err := f.osReadFile(filepath.Join(directory, file.Name()))
   234  				if err != nil {
   235  					if f.errorsHandler(err) {
   236  						continue // if asked to ignore it lets continue
   237  					}
   238  					return err
   239  				}
   240  
   241  				abs, err := filepath.Abs(directory)
   242  				if err != nil {
   243  					if f.errorsHandler(err) {
   244  						continue // if asked to ignore it lets continue
   245  					}
   246  					return err
   247  				}
   248  
   249  				gitIgnore := gitignore.New(bytes.NewReader(c), abs, nil)
   250  				gitignores = append(gitignores, gitIgnore)
   251  			}
   252  		}
   253  
   254  		if !f.IgnoreIgnoreFile {
   255  			if file.Name() == Ignore {
   256  				c, err := f.osReadFile(filepath.Join(directory, file.Name()))
   257  				if err != nil {
   258  					if f.errorsHandler(err) {
   259  						continue // if asked to ignore it lets continue
   260  					}
   261  					return err
   262  				}
   263  
   264  				abs, err := filepath.Abs(directory)
   265  				if err != nil {
   266  					if f.errorsHandler(err) {
   267  						continue // if asked to ignore it lets continue
   268  					}
   269  					return err
   270  				}
   271  
   272  				gitIgnore := gitignore.New(bytes.NewReader(c), abs, nil)
   273  				ignores = append(ignores, gitIgnore)
   274  			}
   275  		}
   276  	}
   277  
   278  	// Process files first to start feeding whatever process is consuming
   279  	// the output before traversing into directories for more files
   280  	for _, file := range files {
   281  		shouldIgnore := false
   282  		joined := filepath.Join(directory, file.Name())
   283  
   284  		for _, ignore := range gitignores {
   285  			// we have the following situations
   286  			// 1. none of the gitignores match
   287  			// 2. one or more match
   288  			// for #1 this means we should include the file
   289  			// for #2 this means the last one wins since it should be the most correct
   290  			if ignore.MatchIsDir(joined, false) != nil {
   291  				shouldIgnore = ignore.Ignore(joined)
   292  			}
   293  		}
   294  
   295  		for _, ignore := range ignores {
   296  			// same rules as above
   297  			if ignore.MatchIsDir(joined, false) != nil {
   298  				shouldIgnore = ignore.Ignore(joined)
   299  			}
   300  		}
   301  
   302  		if len(f.IncludeFilename) != 0 {
   303  			// include files
   304  			found := false
   305  			for _, allow := range f.IncludeFilename {
   306  				if file.Name() == allow {
   307  					found = true
   308  				}
   309  			}
   310  			if !found {
   311  				shouldIgnore = true
   312  			}
   313  		}
   314  		// Exclude comes after include as it takes precedence
   315  		for _, deny := range f.ExcludeFilename {
   316  			if file.Name() == deny {
   317  				shouldIgnore = true
   318  			}
   319  		}
   320  
   321  		if len(f.IncludeFilenameRegex) != 0 {
   322  			found := false
   323  			for _, allow := range f.IncludeFilenameRegex {
   324  				if allow.Match([]byte(file.Name())) {
   325  					found = true
   326  				}
   327  			}
   328  			if !found {
   329  				shouldIgnore = true
   330  			}
   331  		}
   332  		// Exclude comes after include as it takes precedence
   333  		for _, deny := range f.ExcludeFilenameRegex {
   334  			if deny.Match([]byte(file.Name())) {
   335  				shouldIgnore = true
   336  			}
   337  		}
   338  
   339  		// Ignore hidden files
   340  		if !f.IncludeHidden {
   341  			s, err := IsHidden(file, directory)
   342  			if err != nil {
   343  				if !f.errorsHandler(err) {
   344  					return err
   345  				}
   346  			}
   347  
   348  			if s {
   349  				shouldIgnore = true
   350  			}
   351  		}
   352  
   353  		// Check against extensions
   354  		if len(f.AllowListExtensions) != 0 {
   355  			ext := GetExtension(file.Name())
   356  
   357  			a := false
   358  			for _, v := range f.AllowListExtensions {
   359  				if v == ext {
   360  					a = true
   361  				}
   362  			}
   363  
   364  			// try again because we could have one of those pesky ones such as something.spec.tsx
   365  			// but only if we didn't already find something to save on a bit of processing
   366  			if !a {
   367  				ext = GetExtension(ext)
   368  				for _, v := range f.AllowListExtensions {
   369  					if v == ext {
   370  						a = true
   371  					}
   372  				}
   373  			}
   374  
   375  			if !a {
   376  				shouldIgnore = true
   377  			}
   378  		}
   379  
   380  		for _, deny := range f.ExcludeListExtensions {
   381  			ext := GetExtension(file.Name())
   382  			if ext == deny {
   383  				shouldIgnore = true
   384  			}
   385  
   386  			if !shouldIgnore {
   387  				ext = GetExtension(ext)
   388  				if ext == deny {
   389  					shouldIgnore = true
   390  				}
   391  			}
   392  		}
   393  
   394  		for _, p := range f.LocationExcludePattern {
   395  			if strings.Contains(joined, p) {
   396  				shouldIgnore = true
   397  			}
   398  		}
   399  
   400  		if !shouldIgnore {
   401  			f.fileListQueue <- &File{
   402  				Location: joined,
   403  				Filename: file.Name(),
   404  			}
   405  		}
   406  	}
   407  
   408  	// Now we process the directories after hopefully giving the
   409  	// channel some files to process
   410  	for _, dir := range dirs {
   411  		var shouldIgnore bool
   412  		joined := filepath.Join(directory, dir.Name())
   413  
   414  		// Check against the ignore files we have if the file we are looking at
   415  		// should be ignored
   416  		// It is safe to always call this because the gitignores will not be added
   417  		// in previous steps
   418  		for _, ignore := range gitignores {
   419  			// we have the following situations
   420  			// 1. none of the gitignores match
   421  			// 2. one or more match
   422  			// for #1 this means we should include the file
   423  			// for #2 this means the last one wins since it should be the most correct
   424  			if ignore.MatchIsDir(joined, true) != nil {
   425  				shouldIgnore = ignore.Ignore(joined)
   426  			}
   427  		}
   428  		for _, ignore := range ignores {
   429  			// same rules as above
   430  			if ignore.MatchIsDir(joined, true) != nil {
   431  				shouldIgnore = ignore.Ignore(joined)
   432  			}
   433  		}
   434  
   435  		// start by saying we didn't find it then check each possible
   436  		// choice to see if we did find it
   437  		// if we didn't find it then we should ignore
   438  		if len(f.IncludeDirectory) != 0 {
   439  			found := false
   440  			for _, allow := range f.IncludeDirectory {
   441  				if dir.Name() == allow {
   442  					found = true
   443  				}
   444  			}
   445  			if !found {
   446  				shouldIgnore = true
   447  			}
   448  		}
   449  		// Confirm if there are any files in the path deny list which usually includes
   450  		// things like .git .hg and .svn
   451  		// Comes after include as it takes precedence
   452  		for _, deny := range f.ExcludeDirectory {
   453  			if dir.Name() == deny {
   454  				shouldIgnore = true
   455  			}
   456  		}
   457  
   458  		if len(f.IncludeDirectoryRegex) != 0 {
   459  			found := false
   460  			for _, allow := range f.IncludeDirectoryRegex {
   461  				if allow.Match([]byte(dir.Name())) {
   462  					found = true
   463  				}
   464  			}
   465  			if !found {
   466  				shouldIgnore = true
   467  			}
   468  		}
   469  		// Exclude comes after include as it takes precedence
   470  		for _, deny := range f.ExcludeDirectoryRegex {
   471  			if deny.Match([]byte(dir.Name())) {
   472  				shouldIgnore = true
   473  			}
   474  		}
   475  
   476  		// Ignore hidden directories
   477  		if !f.IncludeHidden {
   478  			s, err := IsHidden(dir, directory)
   479  			if err != nil {
   480  				if !f.errorsHandler(err) {
   481  					return err
   482  				}
   483  			}
   484  
   485  			if s {
   486  				shouldIgnore = true
   487  			}
   488  		}
   489  
   490  		if !shouldIgnore {
   491  			for _, p := range f.LocationExcludePattern {
   492  				if strings.Contains(joined, p) {
   493  					shouldIgnore = true
   494  				}
   495  			}
   496  
   497  			err = f.walkDirectoryRecursive(joined, gitignores, ignores)
   498  			if err != nil {
   499  				return err
   500  			}
   501  		}
   502  	}
   503  
   504  	return nil
   505  }
   506  
   507  // FindRepositoryRoot given the supplied directory backwards looking for .git or .hg
   508  // directories indicating we should start our search from that
   509  // location as it's the root.
   510  // Returns the first directory below supplied with .git or .hg in it
   511  // otherwise the supplied directory
   512  func FindRepositoryRoot(startDirectory string) string {
   513  	// Firstly try to determine our real location
   514  	curdir, err := os.Getwd()
   515  	if err != nil {
   516  		return startDirectory
   517  	}
   518  
   519  	// Check if we have .git or .hg where we are and if
   520  	// so just return because we are already there
   521  	if checkForGitOrMercurial(curdir) {
   522  		return startDirectory
   523  	}
   524  
   525  	// We did not find something, so now we need to walk the file tree
   526  	// backwards in a cross platform way and if we find
   527  	// a match we return that
   528  	lastIndex := strings.LastIndex(curdir, string(os.PathSeparator))
   529  	for lastIndex != -1 {
   530  		curdir = curdir[:lastIndex]
   531  
   532  		if checkForGitOrMercurial(curdir) {
   533  			return curdir
   534  		}
   535  
   536  		lastIndex = strings.LastIndex(curdir, string(os.PathSeparator))
   537  	}
   538  
   539  	// If we didn't find a good match return the supplied directory
   540  	// so that we start the search from where we started at least
   541  	// rather than the root
   542  	return startDirectory
   543  }
   544  
   545  // Check if there is a .git or .hg folder in the supplied directory
   546  func checkForGitOrMercurial(curdir string) bool {
   547  	if stat, err := os.Stat(filepath.Join(curdir, ".git")); err == nil && stat.IsDir() {
   548  		return true
   549  	}
   550  
   551  	if stat, err := os.Stat(filepath.Join(curdir, ".hg")); err == nil && stat.IsDir() {
   552  		return true
   553  	}
   554  
   555  	return false
   556  }
   557  
   558  // GetExtension is a custom version of extracting extensions for a file
   559  // which deals with extensions specific to code such as
   560  // .travis.yml and the like
   561  func GetExtension(name string) string {
   562  	name = strings.ToLower(name)
   563  	if !strings.Contains(name, ".") {
   564  		return name
   565  	}
   566  
   567  	if strings.LastIndex(name, ".") == 0 {
   568  		return name
   569  	}
   570  
   571  	return path.Ext(name)[1:]
   572  }