github.com/lineaje-labs/syft@v0.98.1-0.20231227153149-9e393f60ff1b/syft/internal/fileresolver/directory_indexer.go (about)

     1  package fileresolver
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io/fs"
     7  	"os"
     8  	"path"
     9  	"path/filepath"
    10  	"strings"
    11  
    12  	"github.com/wagoodman/go-partybus"
    13  	"github.com/wagoodman/go-progress"
    14  
    15  	"github.com/anchore/stereoscope/pkg/file"
    16  	"github.com/anchore/stereoscope/pkg/filetree"
    17  	"github.com/anchore/syft/syft/event"
    18  	"github.com/lineaje-labs/syft/internal"
    19  	"github.com/lineaje-labs/syft/internal/bus"
    20  	"github.com/lineaje-labs/syft/internal/log"
    21  	"github.com/lineaje-labs/syft/syft/internal/windows"
    22  )
    23  
    24  type PathIndexVisitor func(string, os.FileInfo, error) error
    25  
// directoryIndexer carries the state used while walking a filesystem path and recording what is found
// into a file tree and its accompanying index.
type directoryIndexer struct {
	// path is the location handed to indexAllRoots when build is called.
	path string
	// base is the directory that symlink targets are resolved against (see addSymlinkToIndex).
	base string
	// pathIndexVisitors are consulted in order for every path; the first non-nil result excludes the path.
	pathIndexVisitors []PathIndexVisitor
	// errPaths records, keyed by path, the error that kept a path from being indexed.
	errPaths map[string]error
	// tree holds the paths that have been added so far.
	tree filetree.ReadWriter
	// index holds the metadata for each reference added to tree.
	index filetree.Index
}
    34  
    35  func newDirectoryIndexer(path, base string, visitors ...PathIndexVisitor) *directoryIndexer {
    36  	i := &directoryIndexer{
    37  		path:              path,
    38  		base:              base,
    39  		tree:              filetree.New(),
    40  		index:             filetree.NewIndex(),
    41  		pathIndexVisitors: append([]PathIndexVisitor{requireFileInfo, disallowByFileType, disallowUnixSystemRuntimePath}, visitors...),
    42  		errPaths:          make(map[string]error),
    43  	}
    44  
    45  	// these additional stateful visitors should be the first thing considered when walking / indexing
    46  	i.pathIndexVisitors = append(
    47  		[]PathIndexVisitor{
    48  			i.disallowRevisitingVisitor,
    49  			i.disallowFileAccessErr,
    50  		},
    51  		i.pathIndexVisitors...,
    52  	)
    53  
    54  	return i
    55  }
    56  
    57  func (r *directoryIndexer) build() (filetree.Reader, filetree.IndexReader, error) {
    58  	return r.tree, r.index, indexAllRoots(r.path, r.indexTree)
    59  }
    60  
    61  func indexAllRoots(root string, indexer func(string, *progress.Stage) ([]string, error)) error {
    62  	// why account for multiple roots? To cover cases when there is a symlink that references above the root path,
    63  	// in which case we need to additionally index where the link resolves to. it's for this reason why the filetree
    64  	// must be relative to the root of the filesystem (and not just relative to the given path).
    65  	pathsToIndex := []string{root}
    66  	fullPathsMap := map[string]struct{}{}
    67  
    68  	stager, prog := indexingProgress(root)
    69  	defer prog.SetCompleted()
    70  loop:
    71  	for {
    72  		var currentPath string
    73  		switch len(pathsToIndex) {
    74  		case 0:
    75  			break loop
    76  		case 1:
    77  			currentPath, pathsToIndex = pathsToIndex[0], nil
    78  		default:
    79  			currentPath, pathsToIndex = pathsToIndex[0], pathsToIndex[1:]
    80  		}
    81  
    82  		additionalRoots, err := indexer(currentPath, stager)
    83  		if err != nil {
    84  			return fmt.Errorf("unable to index filesystem path=%q: %w", currentPath, err)
    85  		}
    86  
    87  		for _, newRoot := range additionalRoots {
    88  			if _, ok := fullPathsMap[newRoot]; !ok {
    89  				fullPathsMap[newRoot] = struct{}{}
    90  				pathsToIndex = append(pathsToIndex, newRoot)
    91  			}
    92  		}
    93  	}
    94  
    95  	return nil
    96  }
    97  
    98  func (r *directoryIndexer) indexTree(root string, stager *progress.Stage) ([]string, error) {
    99  	log.WithFields("path", root).Trace("indexing filetree")
   100  
   101  	var roots []string
   102  	var err error
   103  
   104  	root, err = filepath.Abs(root)
   105  	if err != nil {
   106  		return nil, err
   107  	}
   108  
   109  	// we want to be able to index single files with the directory resolver. However, we should also allow for attempting
   110  	// to index paths that do not exist (that is, a root that does not exist is not an error case that should stop indexing).
   111  	// For this reason we look for an opportunity to discover if the given root is a file, and if so add a single root,
   112  	// but continue forth with index regardless if the given root path exists or not.
   113  	fi, err := os.Stat(root)
   114  	if err != nil && fi != nil && !fi.IsDir() {
   115  		// note: we want to index the path regardless of an error stat-ing the path
   116  		newRoot, _ := r.indexPath(root, fi, nil)
   117  		if newRoot != "" {
   118  			roots = append(roots, newRoot)
   119  		}
   120  		return roots, nil
   121  	}
   122  
   123  	shouldIndexFullTree, err := isRealPath(root)
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  
   128  	if !shouldIndexFullTree {
   129  		newRoots, err := r.indexBranch(root, stager)
   130  		if err != nil {
   131  			return nil, fmt.Errorf("unable to index branch=%q: %w", root, err)
   132  		}
   133  
   134  		roots = append(roots, newRoots...)
   135  
   136  		return roots, nil
   137  	}
   138  
   139  	err = filepath.Walk(root,
   140  		func(path string, info os.FileInfo, err error) error {
   141  			stager.Current = path
   142  
   143  			newRoot, err := r.indexPath(path, info, err)
   144  
   145  			if err != nil {
   146  				return err
   147  			}
   148  
   149  			if newRoot != "" {
   150  				roots = append(roots, newRoot)
   151  			}
   152  
   153  			return nil
   154  		})
   155  
   156  	if err != nil {
   157  		return nil, fmt.Errorf("unable to index root=%q: %w", root, err)
   158  	}
   159  
   160  	return roots, nil
   161  }
   162  
   163  func isRealPath(root string) (bool, error) {
   164  	rootParent := filepath.Clean(filepath.Dir(root))
   165  
   166  	realRootParent, err := filepath.EvalSymlinks(rootParent)
   167  	if err != nil {
   168  		return false, err
   169  	}
   170  
   171  	realRootParent = filepath.Clean(realRootParent)
   172  
   173  	return rootParent == realRootParent, nil
   174  }
   175  
   176  func (r *directoryIndexer) indexBranch(root string, stager *progress.Stage) ([]string, error) {
   177  	rootRealPath, err := filepath.EvalSymlinks(root)
   178  	if err != nil {
   179  		return nil, err
   180  	}
   181  
   182  	// there is a symlink within the path to the root, we need to index the real root parent first
   183  	// then capture the symlinks to the root path
   184  	roots, err := r.indexTree(rootRealPath, stager)
   185  	if err != nil {
   186  		return nil, fmt.Errorf("unable to index real root=%q: %w", rootRealPath, err)
   187  	}
   188  
   189  	// walk down all ancestor paths and shallow-add non-existing elements to the tree
   190  	for idx, p := range allContainedPaths(root) {
   191  		var targetPath string
   192  		if idx != 0 {
   193  			parent := path.Dir(p)
   194  			cleanParent, err := filepath.EvalSymlinks(parent)
   195  			if err != nil {
   196  				return nil, fmt.Errorf("unable to evaluate symlink for contained path parent=%q: %w", parent, err)
   197  			}
   198  			targetPath = filepath.Join(cleanParent, filepath.Base(p))
   199  		} else {
   200  			targetPath = p
   201  		}
   202  
   203  		stager.Current = targetPath
   204  
   205  		lstat, err := os.Lstat(targetPath)
   206  		newRoot, err := r.indexPath(targetPath, lstat, err)
   207  		if err != nil && !errors.Is(err, ErrSkipPath) && !errors.Is(err, fs.SkipDir) {
   208  			return nil, fmt.Errorf("unable to index ancestor path=%q: %w", targetPath, err)
   209  		}
   210  		if newRoot != "" {
   211  			roots = append(roots, newRoot)
   212  		}
   213  	}
   214  
   215  	return roots, nil
   216  }
   217  
   218  func allContainedPaths(p string) []string {
   219  	var all []string
   220  	var currentPath string
   221  
   222  	cleanPath := strings.TrimSpace(p)
   223  
   224  	if cleanPath == "" {
   225  		return nil
   226  	}
   227  
   228  	// iterate through all parts of the path, replacing path elements with link resolutions where possible.
   229  	for idx, part := range strings.Split(filepath.Clean(cleanPath), file.DirSeparator) {
   230  		if idx == 0 && part == "" {
   231  			currentPath = file.DirSeparator
   232  			continue
   233  		}
   234  
   235  		// cumulatively gather where we are currently at and provide a rich object
   236  		currentPath = path.Join(currentPath, part)
   237  		all = append(all, currentPath)
   238  	}
   239  	return all
   240  }
   241  
   242  func (r *directoryIndexer) indexPath(path string, info os.FileInfo, err error) (string, error) {
   243  	// ignore any path which a filter function returns true
   244  	for _, filterFn := range r.pathIndexVisitors {
   245  		if filterFn == nil {
   246  			continue
   247  		}
   248  
   249  		if filterErr := filterFn(path, info, err); filterErr != nil {
   250  			if errors.Is(filterErr, fs.SkipDir) {
   251  				// signal to walk() to skip this directory entirely (even if we're processing a file)
   252  				return "", filterErr
   253  			}
   254  			// skip this path but don't affect walk() trajectory
   255  			return "", nil
   256  		}
   257  	}
   258  
   259  	if info == nil {
   260  		// walk may not be able to provide a FileInfo object, don't allow for this to stop indexing; keep track of the paths and continue.
   261  		r.errPaths[path] = fmt.Errorf("no file info observable at path=%q", path)
   262  		return "", nil
   263  	}
   264  
   265  	// here we check to see if we need to normalize paths to posix on the way in coming from windows
   266  	if windows.HostRunningOnWindows() {
   267  		path = windows.ToPosix(path)
   268  	}
   269  
   270  	newRoot, err := r.addPathToIndex(path, info)
   271  	if r.isFileAccessErr(path, err) {
   272  		return "", nil
   273  	}
   274  
   275  	return newRoot, nil
   276  }
   277  
   278  func (r *directoryIndexer) disallowFileAccessErr(path string, _ os.FileInfo, err error) error {
   279  	if r.isFileAccessErr(path, err) {
   280  		return ErrSkipPath
   281  	}
   282  	return nil
   283  }
   284  
   285  func (r *directoryIndexer) isFileAccessErr(path string, err error) bool {
   286  	// don't allow for errors to stop indexing, keep track of the paths and continue.
   287  	if err != nil {
   288  		log.Warnf("unable to access path=%q: %+v", path, err)
   289  		r.errPaths[path] = err
   290  		return true
   291  	}
   292  	return false
   293  }
   294  
   295  func (r directoryIndexer) addPathToIndex(p string, info os.FileInfo) (string, error) {
   296  	switch t := file.TypeFromMode(info.Mode()); t {
   297  	case file.TypeSymLink:
   298  		return r.addSymlinkToIndex(p, info)
   299  	case file.TypeDirectory:
   300  		return "", r.addDirectoryToIndex(p, info)
   301  	case file.TypeRegular:
   302  		return "", r.addFileToIndex(p, info)
   303  	default:
   304  		return "", fmt.Errorf("unsupported file type: %s", t)
   305  	}
   306  }
   307  
   308  func (r directoryIndexer) addDirectoryToIndex(p string, info os.FileInfo) error {
   309  	ref, err := r.tree.AddDir(file.Path(p))
   310  	if err != nil {
   311  		return err
   312  	}
   313  
   314  	metadata := file.NewMetadataFromPath(p, info)
   315  	r.index.Add(*ref, metadata)
   316  
   317  	return nil
   318  }
   319  
   320  func (r directoryIndexer) addFileToIndex(p string, info os.FileInfo) error {
   321  	ref, err := r.tree.AddFile(file.Path(p))
   322  	if err != nil {
   323  		return err
   324  	}
   325  
   326  	metadata := file.NewMetadataFromPath(p, info)
   327  	r.index.Add(*ref, metadata)
   328  
   329  	return nil
   330  }
   331  
   332  func (r directoryIndexer) addSymlinkToIndex(p string, info os.FileInfo) (string, error) {
   333  	linkTarget, err := os.Readlink(p)
   334  	if err != nil {
   335  		isOnWindows := windows.HostRunningOnWindows()
   336  		if isOnWindows {
   337  			p = windows.FromPosix(p)
   338  		}
   339  
   340  		linkTarget, err = filepath.EvalSymlinks(p)
   341  
   342  		if isOnWindows {
   343  			p = windows.ToPosix(p)
   344  		}
   345  
   346  		if err != nil {
   347  			return "", fmt.Errorf("unable to readlink for path=%q: %w", p, err)
   348  		}
   349  	}
   350  
   351  	if filepath.IsAbs(linkTarget) {
   352  		linkTarget = filepath.Clean(linkTarget)
   353  		// if the link is absolute (e.g, /bin/ls -> /bin/busybox) we need to
   354  		// resolve relative to the root of the base directory, if it is not already
   355  		// prefixed with a volume name
   356  		if filepath.VolumeName(linkTarget) == "" {
   357  			linkTarget = filepath.Join(r.base, filepath.Clean(linkTarget))
   358  		}
   359  	} else {
   360  		// if the link is not absolute (e.g, /dev/stderr -> fd/2 ) we need to
   361  		// resolve it relative to the directory in question (e.g. resolve to
   362  		// /dev/fd/2)
   363  		if r.base == "" {
   364  			linkTarget = filepath.Join(filepath.Dir(p), linkTarget)
   365  		} else {
   366  			// if the base is set, then we first need to resolve the link,
   367  			// before finding it's location in the base
   368  			dir, err := filepath.Rel(r.base, filepath.Dir(p))
   369  			if err != nil {
   370  				return "", fmt.Errorf("unable to resolve relative path for path=%q: %w", p, err)
   371  			}
   372  			linkTarget = filepath.Join(r.base, filepath.Clean(filepath.Join("/", dir, linkTarget)))
   373  		}
   374  	}
   375  
   376  	ref, err := r.tree.AddSymLink(file.Path(p), file.Path(linkTarget))
   377  	if err != nil {
   378  		return "", err
   379  	}
   380  
   381  	targetAbsPath := linkTarget
   382  	if !filepath.IsAbs(targetAbsPath) {
   383  		targetAbsPath = filepath.Clean(filepath.Join(path.Dir(p), linkTarget))
   384  	}
   385  
   386  	metadata := file.NewMetadataFromPath(p, info)
   387  	metadata.LinkDestination = linkTarget
   388  	r.index.Add(*ref, metadata)
   389  
   390  	// if the target path does not exist, then do not report it as a new root, or try to send
   391  	// syft parsing there.
   392  	if _, err := os.Stat(targetAbsPath); err != nil && errors.Is(err, os.ErrNotExist) {
   393  		log.Debugf("link %s points to unresolved path %s, ignoring target as new root", p, targetAbsPath)
   394  		targetAbsPath = ""
   395  	}
   396  
   397  	return targetAbsPath, nil
   398  }
   399  
   400  func (r directoryIndexer) hasBeenIndexed(p string) (bool, *file.Metadata) {
   401  	filePath := file.Path(p)
   402  	if !r.tree.HasPath(filePath) {
   403  		return false, nil
   404  	}
   405  
   406  	exists, ref, err := r.tree.File(filePath)
   407  	if err != nil || !exists || !ref.HasReference() {
   408  		return false, nil
   409  	}
   410  
   411  	// cases like "/" will be in the tree, but not been indexed yet (a special case). We want to capture
   412  	// these cases as new paths to index.
   413  	if !ref.HasReference() {
   414  		return false, nil
   415  	}
   416  
   417  	entry, err := r.index.Get(*ref.Reference)
   418  	if err != nil {
   419  		return false, nil
   420  	}
   421  
   422  	return true, &entry.Metadata
   423  }
   424  
   425  func (r *directoryIndexer) disallowRevisitingVisitor(path string, _ os.FileInfo, _ error) error {
   426  	// this prevents visiting:
   427  	// - link destinations twice, once for the real file and another through the virtual path
   428  	// - infinite link cycles
   429  	if indexed, metadata := r.hasBeenIndexed(path); indexed {
   430  		if metadata.IsDir() {
   431  			// signal to walk() that we should skip this directory entirely
   432  			return fs.SkipDir
   433  		}
   434  		return ErrSkipPath
   435  	}
   436  	return nil
   437  }
   438  
   439  func disallowUnixSystemRuntimePath(path string, _ os.FileInfo, _ error) error {
   440  	if internal.HasAnyOfPrefixes(path, unixSystemRuntimePrefixes...) {
   441  		return fs.SkipDir
   442  	}
   443  	return nil
   444  }
   445  
   446  func disallowByFileType(_ string, info os.FileInfo, _ error) error {
   447  	if info == nil {
   448  		// we can't filter out by filetype for non-existent files
   449  		return nil
   450  	}
   451  	switch file.TypeFromMode(info.Mode()) {
   452  	case file.TypeCharacterDevice, file.TypeSocket, file.TypeBlockDevice, file.TypeFIFO, file.TypeIrregular:
   453  		return ErrSkipPath
   454  		// note: symlinks that point to these files may still get by.
   455  		// We handle this later in processing to help prevent against infinite links traversal.
   456  	}
   457  
   458  	return nil
   459  }
   460  
   461  func requireFileInfo(_ string, info os.FileInfo, _ error) error {
   462  	if info == nil {
   463  		return ErrSkipPath
   464  	}
   465  	return nil
   466  }
   467  
   468  func indexingProgress(path string) (*progress.Stage, *progress.Manual) {
   469  	stage := &progress.Stage{}
   470  	prog := progress.NewManual(-1)
   471  
   472  	bus.Publish(partybus.Event{
   473  		Type:   event.FileIndexingStarted,
   474  		Source: path,
   475  		Value: struct {
   476  			progress.Stager
   477  			progress.Progressable
   478  		}{
   479  			Stager:       progress.Stager(stage),
   480  			Progressable: prog,
   481  		},
   482  	})
   483  
   484  	return stage, prog
   485  }