github.com/kastenhq/syft@v0.0.0-20230821225854-0710af25cdbe/syft/internal/fileresolver/directory_indexer.go (about)

     1  package fileresolver
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io/fs"
     7  	"os"
     8  	"path"
     9  	"path/filepath"
    10  	"strings"
    11  
    12  	"github.com/wagoodman/go-partybus"
    13  	"github.com/wagoodman/go-progress"
    14  
    15  	"github.com/anchore/stereoscope/pkg/file"
    16  	"github.com/anchore/stereoscope/pkg/filetree"
    17  	"github.com/kastenhq/syft/internal"
    18  	"github.com/kastenhq/syft/internal/bus"
    19  	"github.com/kastenhq/syft/internal/log"
    20  	"github.com/kastenhq/syft/syft/event"
    21  	"github.com/kastenhq/syft/syft/internal/windows"
    22  )
    23  
// PathIndexVisitor is a callback consulted for every path the indexer is about to add. It receives the
// path, the file info for that path (which may be nil), and any error observed while reaching the path.
// Returning a non-nil error prevents the path from being indexed; an error matching fs.SkipDir is
// additionally handed back to the caller of the indexing routine so a directory walk can skip the subtree.
type PathIndexVisitor func(string, os.FileInfo, error) error
    25  
// directoryIndexer walks a filesystem location and records what it finds in a file tree plus a
// metadata index.
type directoryIndexer struct {
	// path is the starting location handed to the indexing loop by build().
	path string
	// base is the directory that symlink targets are re-rooted under when links are added to the tree.
	base string
	// pathIndexVisitors are consulted, in order, before any path is added to the index.
	pathIndexVisitors []PathIndexVisitor
	// errPaths records the error observed for each path that could not be accessed or indexed.
	errPaths map[string]error
	// tree holds the indexed paths (files, directories, and symlinks).
	tree filetree.ReadWriter
	// index holds the metadata for each reference added to the tree.
	index filetree.Index
}
    34  
    35  func newDirectoryIndexer(path, base string, visitors ...PathIndexVisitor) *directoryIndexer {
    36  	i := &directoryIndexer{
    37  		path:              path,
    38  		base:              base,
    39  		tree:              filetree.New(),
    40  		index:             filetree.NewIndex(),
    41  		pathIndexVisitors: append([]PathIndexVisitor{requireFileInfo, disallowByFileType, disallowUnixSystemRuntimePath}, visitors...),
    42  		errPaths:          make(map[string]error),
    43  	}
    44  
    45  	// these additional stateful visitors should be the first thing considered when walking / indexing
    46  	i.pathIndexVisitors = append(
    47  		[]PathIndexVisitor{
    48  			i.disallowRevisitingVisitor,
    49  			i.disallowFileAccessErr,
    50  		},
    51  		i.pathIndexVisitors...,
    52  	)
    53  
    54  	return i
    55  }
    56  
    57  func (r *directoryIndexer) build() (filetree.Reader, filetree.IndexReader, error) {
    58  	return r.tree, r.index, indexAllRoots(r.path, r.indexTree)
    59  }
    60  
    61  func indexAllRoots(root string, indexer func(string, *progress.Stage) ([]string, error)) error {
    62  	// why account for multiple roots? To cover cases when there is a symlink that references above the root path,
    63  	// in which case we need to additionally index where the link resolves to. it's for this reason why the filetree
    64  	// must be relative to the root of the filesystem (and not just relative to the given path).
    65  	pathsToIndex := []string{root}
    66  	fullPathsMap := map[string]struct{}{}
    67  
    68  	stager, prog := indexingProgress(root)
    69  	defer prog.SetCompleted()
    70  loop:
    71  	for {
    72  		var currentPath string
    73  		switch len(pathsToIndex) {
    74  		case 0:
    75  			break loop
    76  		case 1:
    77  			currentPath, pathsToIndex = pathsToIndex[0], nil
    78  		default:
    79  			currentPath, pathsToIndex = pathsToIndex[0], pathsToIndex[1:]
    80  		}
    81  
    82  		additionalRoots, err := indexer(currentPath, stager)
    83  		if err != nil {
    84  			return fmt.Errorf("unable to index filesystem path=%q: %w", currentPath, err)
    85  		}
    86  
    87  		for _, newRoot := range additionalRoots {
    88  			if _, ok := fullPathsMap[newRoot]; !ok {
    89  				fullPathsMap[newRoot] = struct{}{}
    90  				pathsToIndex = append(pathsToIndex, newRoot)
    91  			}
    92  		}
    93  	}
    94  
    95  	return nil
    96  }
    97  
// indexTree indexes everything reachable from root and returns any additional roots (symlink
// destinations) that were discovered along the way and still need to be indexed by the caller.
//
// The root is first made absolute. If the parent of the root contains a symlink (see isRealPath), the work
// is delegated to indexBranch; otherwise the root is walked with filepath.Walk and every visited path is
// passed through indexPath. An error returned by indexPath aborts the walk and is returned wrapped.
func (r *directoryIndexer) indexTree(root string, stager *progress.Stage) ([]string, error) {
	log.WithFields("path", root).Trace("indexing filetree")

	var roots []string
	var err error

	root, err = filepath.Abs(root)
	if err != nil {
		return nil, err
	}

	// we want to be able to index single files with the directory resolver. However, we should also allow for attempting
	// to index paths that do not exist (that is, a root that does not exist is not an error case that should stop indexing).
	// For this reason we look for an opportunity to discover if the given root is a file, and if so add a single root,
	// but continue forth with index regardless if the given root path exists or not.
	//
	// NOTE(review): os.Stat normally returns a nil FileInfo whenever it returns a non-nil error, so the
	// condition below (err != nil && fi != nil) looks like it can never be true and single-file roots
	// fall through to the walk further down. Confirm whether `err == nil` was intended before changing it.
	fi, err := os.Stat(root)
	if err != nil && fi != nil && !fi.IsDir() {
		// note: we want to index the path regardless of an error stat-ing the path
		newRoot, _ := r.indexPath(root, fi, nil)
		if newRoot != "" {
			roots = append(roots, newRoot)
		}
		return roots, nil
	}

	// only walk the tree directly when the path leading to the root is free of symlinks
	shouldIndexFullTree, err := isRealPath(root)
	if err != nil {
		return nil, err
	}

	if !shouldIndexFullTree {
		// a symlink exists in the path to the root: index the resolved location and its ancestors instead
		newRoots, err := r.indexBranch(root, stager)
		if err != nil {
			return nil, fmt.Errorf("unable to index branch=%q: %w", root, err)
		}

		roots = append(roots, newRoots...)

		return roots, nil
	}

	err = filepath.Walk(root,
		func(path string, info os.FileInfo, err error) error {
			// surface the path currently being processed to progress listeners
			stager.Current = path

			// the walk error (if any) is handed to indexPath, which decides whether to skip or stop
			newRoot, err := r.indexPath(path, info, err)

			if err != nil {
				return err
			}

			if newRoot != "" {
				roots = append(roots, newRoot)
			}

			return nil
		})

	if err != nil {
		return nil, fmt.Errorf("unable to index root=%q: %w", root, err)
	}

	return roots, nil
}
   162  
   163  func isRealPath(root string) (bool, error) {
   164  	rootParent := filepath.Clean(filepath.Dir(root))
   165  
   166  	realRootParent, err := filepath.EvalSymlinks(rootParent)
   167  	if err != nil {
   168  		return false, err
   169  	}
   170  
   171  	realRootParent = filepath.Clean(realRootParent)
   172  
   173  	return rootParent == realRootParent, nil
   174  }
   175  
   176  func (r *directoryIndexer) indexBranch(root string, stager *progress.Stage) ([]string, error) {
   177  	rootRealPath, err := filepath.EvalSymlinks(root)
   178  	if err != nil {
   179  		return nil, err
   180  	}
   181  
   182  	// there is a symlink within the path to the root, we need to index the real root parent first
   183  	// then capture the symlinks to the root path
   184  	roots, err := r.indexTree(rootRealPath, stager)
   185  	if err != nil {
   186  		return nil, fmt.Errorf("unable to index real root=%q: %w", rootRealPath, err)
   187  	}
   188  
   189  	// walk down all ancestor paths and shallow-add non-existing elements to the tree
   190  	for idx, p := range allContainedPaths(root) {
   191  		var targetPath string
   192  		if idx != 0 {
   193  			parent := path.Dir(p)
   194  			cleanParent, err := filepath.EvalSymlinks(parent)
   195  			if err != nil {
   196  				return nil, fmt.Errorf("unable to evaluate symlink for contained path parent=%q: %w", parent, err)
   197  			}
   198  			targetPath = filepath.Join(cleanParent, filepath.Base(p))
   199  		} else {
   200  			targetPath = p
   201  		}
   202  
   203  		stager.Current = targetPath
   204  
   205  		lstat, err := os.Lstat(targetPath)
   206  		newRoot, err := r.indexPath(targetPath, lstat, err)
   207  		if err != nil && !errors.Is(err, ErrSkipPath) && !errors.Is(err, fs.SkipDir) {
   208  			return nil, fmt.Errorf("unable to index ancestor path=%q: %w", targetPath, err)
   209  		}
   210  		if newRoot != "" {
   211  			roots = append(roots, newRoot)
   212  		}
   213  	}
   214  
   215  	return roots, nil
   216  }
   217  
   218  func allContainedPaths(p string) []string {
   219  	var all []string
   220  	var currentPath string
   221  
   222  	cleanPath := strings.TrimSpace(p)
   223  
   224  	if cleanPath == "" {
   225  		return nil
   226  	}
   227  
   228  	// iterate through all parts of the path, replacing path elements with link resolutions where possible.
   229  	for idx, part := range strings.Split(filepath.Clean(cleanPath), file.DirSeparator) {
   230  		if idx == 0 && part == "" {
   231  			currentPath = file.DirSeparator
   232  			continue
   233  		}
   234  
   235  		// cumulatively gather where we are currently at and provide a rich object
   236  		currentPath = path.Join(currentPath, part)
   237  		all = append(all, currentPath)
   238  	}
   239  	return all
   240  }
   241  
   242  func (r *directoryIndexer) indexPath(path string, info os.FileInfo, err error) (string, error) {
   243  	// ignore any path which a filter function returns true
   244  	for _, filterFn := range r.pathIndexVisitors {
   245  		if filterFn == nil {
   246  			continue
   247  		}
   248  
   249  		if filterErr := filterFn(path, info, err); filterErr != nil {
   250  			if errors.Is(filterErr, fs.SkipDir) {
   251  				// signal to walk() to skip this directory entirely (even if we're processing a file)
   252  				return "", filterErr
   253  			}
   254  			// skip this path but don't affect walk() trajectory
   255  			return "", nil
   256  		}
   257  	}
   258  
   259  	if info == nil {
   260  		// walk may not be able to provide a FileInfo object, don't allow for this to stop indexing; keep track of the paths and continue.
   261  		r.errPaths[path] = fmt.Errorf("no file info observable at path=%q", path)
   262  		return "", nil
   263  	}
   264  
   265  	// here we check to see if we need to normalize paths to posix on the way in coming from windows
   266  	if windows.HostRunningOnWindows() {
   267  		path = windows.ToPosix(path)
   268  	}
   269  
   270  	newRoot, err := r.addPathToIndex(path, info)
   271  	if r.isFileAccessErr(path, err) {
   272  		return "", nil
   273  	}
   274  
   275  	return newRoot, nil
   276  }
   277  
   278  func (r *directoryIndexer) disallowFileAccessErr(path string, _ os.FileInfo, err error) error {
   279  	if r.isFileAccessErr(path, err) {
   280  		return ErrSkipPath
   281  	}
   282  	return nil
   283  }
   284  
   285  func (r *directoryIndexer) isFileAccessErr(path string, err error) bool {
   286  	// don't allow for errors to stop indexing, keep track of the paths and continue.
   287  	if err != nil {
   288  		log.Warnf("unable to access path=%q: %+v", path, err)
   289  		r.errPaths[path] = err
   290  		return true
   291  	}
   292  	return false
   293  }
   294  
   295  func (r directoryIndexer) addPathToIndex(p string, info os.FileInfo) (string, error) {
   296  	switch t := file.TypeFromMode(info.Mode()); t {
   297  	case file.TypeSymLink:
   298  		return r.addSymlinkToIndex(p, info)
   299  	case file.TypeDirectory:
   300  		return "", r.addDirectoryToIndex(p, info)
   301  	case file.TypeRegular:
   302  		return "", r.addFileToIndex(p, info)
   303  	default:
   304  		return "", fmt.Errorf("unsupported file type: %s", t)
   305  	}
   306  }
   307  
   308  func (r directoryIndexer) addDirectoryToIndex(p string, info os.FileInfo) error {
   309  	ref, err := r.tree.AddDir(file.Path(p))
   310  	if err != nil {
   311  		return err
   312  	}
   313  
   314  	metadata := file.NewMetadataFromPath(p, info)
   315  	r.index.Add(*ref, metadata)
   316  
   317  	return nil
   318  }
   319  
   320  func (r directoryIndexer) addFileToIndex(p string, info os.FileInfo) error {
   321  	ref, err := r.tree.AddFile(file.Path(p))
   322  	if err != nil {
   323  		return err
   324  	}
   325  
   326  	metadata := file.NewMetadataFromPath(p, info)
   327  	r.index.Add(*ref, metadata)
   328  
   329  	return nil
   330  }
   331  
// addSymlinkToIndex records the symlink at p in the tree (pointing at its destination re-rooted under
// r.base) and stores its metadata, including the link destination, in the index.
//
// The returned string is the link destination that should be indexed as an additional root. It is empty
// when stat-ing the destination reports that it does not exist.
func (r directoryIndexer) addSymlinkToIndex(p string, info os.FileInfo) (string, error) {
	linkTarget, err := os.Readlink(p)
	if err != nil {
		// readlink failed: fall back to fully evaluating the link. On windows the path is temporarily
		// converted to its native form for the evaluation and converted back to posix afterwards.
		isOnWindows := windows.HostRunningOnWindows()
		if isOnWindows {
			p = windows.FromPosix(p)
		}

		// NOTE(review): unlike Readlink, EvalSymlinks yields a resolved path rather than the raw link
		// text; the re-rooting under r.base below is still applied to it — confirm this is intended.
		linkTarget, err = filepath.EvalSymlinks(p)

		if isOnWindows {
			p = windows.ToPosix(p)
		}

		if err != nil {
			return "", fmt.Errorf("unable to readlink for path=%q: %w", p, err)
		}
	}

	if filepath.IsAbs(linkTarget) {
		// if the link is absolute (e.g, /bin/ls -> /bin/busybox) we need to
		// resolve relative to the root of the base directory
		linkTarget = filepath.Join(r.base, filepath.Clean(linkTarget))
	} else {
		// if the link is not absolute (e.g, /dev/stderr -> fd/2 ) we need to
		// resolve it relative to the directory in question (e.g. resolve to
		// /dev/fd/2)
		if r.base == "" {
			linkTarget = filepath.Join(filepath.Dir(p), linkTarget)
		} else {
			// if the base is set, then we first need to resolve the link,
			// before finding its location in the base
			dir, err := filepath.Rel(r.base, filepath.Dir(p))
			if err != nil {
				return "", fmt.Errorf("unable to resolve relative path for path=%q: %w", p, err)
			}
			// anchoring at "/" before cleaning keeps ".." elements from climbing above the base
			linkTarget = filepath.Join(r.base, filepath.Clean(filepath.Join("/", dir, linkTarget)))
		}
	}

	ref, err := r.tree.AddSymLink(file.Path(p), file.Path(linkTarget))
	if err != nil {
		return "", err
	}

	// NOTE(review): linkTarget has already been joined with a directory above, so this branch only
	// applies when that result is still relative (e.g. a relative r.base) — confirm against callers.
	targetAbsPath := linkTarget
	if !filepath.IsAbs(targetAbsPath) {
		targetAbsPath = filepath.Clean(filepath.Join(path.Dir(p), linkTarget))
	}

	metadata := file.NewMetadataFromPath(p, info)
	metadata.LinkDestination = linkTarget
	r.index.Add(*ref, metadata)

	// if the target path does not exist, then do not report it as a new root, or try to send
	// syft parsing there. Any other stat failure still reports the target as a new root.
	if _, err := os.Stat(targetAbsPath); err != nil && errors.Is(err, os.ErrNotExist) {
		log.Debugf("link %s points to unresolved path %s, ignoring target as new root", p, targetAbsPath)
		targetAbsPath = ""
	}

	return targetAbsPath, nil
}
   395  
   396  func (r directoryIndexer) hasBeenIndexed(p string) (bool, *file.Metadata) {
   397  	filePath := file.Path(p)
   398  	if !r.tree.HasPath(filePath) {
   399  		return false, nil
   400  	}
   401  
   402  	exists, ref, err := r.tree.File(filePath)
   403  	if err != nil || !exists || !ref.HasReference() {
   404  		return false, nil
   405  	}
   406  
   407  	// cases like "/" will be in the tree, but not been indexed yet (a special case). We want to capture
   408  	// these cases as new paths to index.
   409  	if !ref.HasReference() {
   410  		return false, nil
   411  	}
   412  
   413  	entry, err := r.index.Get(*ref.Reference)
   414  	if err != nil {
   415  		return false, nil
   416  	}
   417  
   418  	return true, &entry.Metadata
   419  }
   420  
   421  func (r *directoryIndexer) disallowRevisitingVisitor(path string, _ os.FileInfo, _ error) error {
   422  	// this prevents visiting:
   423  	// - link destinations twice, once for the real file and another through the virtual path
   424  	// - infinite link cycles
   425  	if indexed, metadata := r.hasBeenIndexed(path); indexed {
   426  		if metadata.IsDir() {
   427  			// signal to walk() that we should skip this directory entirely
   428  			return fs.SkipDir
   429  		}
   430  		return ErrSkipPath
   431  	}
   432  	return nil
   433  }
   434  
   435  func disallowUnixSystemRuntimePath(path string, _ os.FileInfo, _ error) error {
   436  	if internal.HasAnyOfPrefixes(path, unixSystemRuntimePrefixes...) {
   437  		return fs.SkipDir
   438  	}
   439  	return nil
   440  }
   441  
   442  func disallowByFileType(_ string, info os.FileInfo, _ error) error {
   443  	if info == nil {
   444  		// we can't filter out by filetype for non-existent files
   445  		return nil
   446  	}
   447  	switch file.TypeFromMode(info.Mode()) {
   448  	case file.TypeCharacterDevice, file.TypeSocket, file.TypeBlockDevice, file.TypeFIFO, file.TypeIrregular:
   449  		return ErrSkipPath
   450  		// note: symlinks that point to these files may still get by.
   451  		// We handle this later in processing to help prevent against infinite links traversal.
   452  	}
   453  
   454  	return nil
   455  }
   456  
   457  func requireFileInfo(_ string, info os.FileInfo, _ error) error {
   458  	if info == nil {
   459  		return ErrSkipPath
   460  	}
   461  	return nil
   462  }
   463  
   464  func indexingProgress(path string) (*progress.Stage, *progress.Manual) {
   465  	stage := &progress.Stage{}
   466  	prog := progress.NewManual(-1)
   467  
   468  	bus.Publish(partybus.Event{
   469  		Type:   event.FileIndexingStarted,
   470  		Source: path,
   471  		Value: struct {
   472  			progress.Stager
   473  			progress.Progressable
   474  		}{
   475  			Stager:       progress.Stager(stage),
   476  			Progressable: prog,
   477  		},
   478  	})
   479  
   480  	return stage, prog
   481  }