github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/python/parse_wheel_egg_metadata.go (about)

     1  package python
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"io"
     7  	"path/filepath"
     8  	"strings"
     9  
    10  	"github.com/mitchellh/mapstructure"
    11  
    12  	intFile "github.com/anchore/syft/internal/file"
    13  	"github.com/anchore/syft/internal/log"
    14  	"github.com/anchore/syft/syft/file"
    15  	"github.com/anchore/syft/syft/pkg"
    16  )
    17  
// parsedData is the decode target for the header fields collected by
// parseWheelOrEggMetadata, together with values that function derives from
// the path of the metadata file.
//
// Keys are matched after dashes have been removed from the header names (for
// example "License-File" is looked up as "LicenseFile").
type parsedData struct {
	// Licenses is decoded from the "License" key.
	Licenses          string `mapstructure:"License"`
	// LicenseFile is decoded from the "LicenseFile" key.
	LicenseFile       string `mapstructure:"LicenseFile"`
	// LicenseExpression is decoded from the "LicenseExpression" key.
	LicenseExpression string `mapstructure:"LicenseExpression"`
	// LicenseLocation is assigned by parseWheelOrEggMetadata after decoding:
	// the metadata file itself when Licenses or LicenseExpression is set,
	// otherwise LicenseFile resolved next to the metadata file.
	LicenseLocation   file.Location
	// The embedded package metadata is squashed, so its fields are populated
	// from the same flat key/value map as the fields above.
	pkg.PythonPackage `mapstructure:",squash"`
}
    25  
    26  // parseWheelOrEggMetadata takes a Python Egg or Wheel (which share the same format and values for our purposes),
    27  // returning all Python packages listed.
    28  func parseWheelOrEggMetadata(path string, reader io.Reader) (parsedData, error) {
    29  	fields := make(map[string]string)
    30  	var key string
    31  
    32  	scanner := bufio.NewScanner(reader)
    33  	for scanner.Scan() {
    34  		line := scanner.Text()
    35  		line = strings.TrimRight(line, "\n")
    36  
    37  		// An empty line means we are done parsing (either because there's no more data,
    38  		// or because a description follows as specified in
    39  		// https://packaging.python.org/specifications/core-metadata/#description;
    40  		// and at this time, we're not interested in the description).
    41  		if len(line) == 0 {
    42  			if len(fields) > 0 {
    43  				break
    44  			}
    45  
    46  			// however, if the field parsing has not started yet, keep scanning lines
    47  			continue
    48  		}
    49  
    50  		switch {
    51  		case strings.HasPrefix(line, " "):
    52  			// a field-body continuation
    53  			updatedValue, err := handleFieldBodyContinuation(key, line, fields)
    54  			if err != nil {
    55  				return parsedData{}, err
    56  			}
    57  
    58  			fields[key] = updatedValue
    59  		default:
    60  			// parse a new key (note, duplicate keys are overridden)
    61  			if i := strings.Index(line, ":"); i > 0 {
    62  				// mapstruct cannot map keys with dashes, and we are expected to persist the "Author-email" field
    63  				key = strings.ReplaceAll(strings.TrimSpace(line[0:i]), "-", "")
    64  				val := strings.TrimSpace(line[i+1:])
    65  
    66  				fields[key] = val
    67  			} else {
    68  				log.Warnf("cannot parse field from path: %q from line: %q", path, line)
    69  			}
    70  		}
    71  	}
    72  
    73  	if err := scanner.Err(); err != nil {
    74  		return parsedData{}, fmt.Errorf("failed to parse python wheel/egg: %w", err)
    75  	}
    76  
    77  	var pd parsedData
    78  	if err := mapstructure.Decode(fields, &pd); err != nil {
    79  		return pd, fmt.Errorf("unable to parse APK metadata: %w", err)
    80  	}
    81  
    82  	// add additional metadata not stored in the egg/wheel metadata file
    83  
    84  	pd.SitePackagesRootPath = determineSitePackagesRootPath(path)
    85  	if pd.Licenses != "" || pd.LicenseExpression != "" {
    86  		pd.LicenseLocation = file.NewLocation(path)
    87  	} else if pd.LicenseFile != "" {
    88  		pd.LicenseLocation = file.NewLocation(filepath.Join(filepath.Dir(path), pd.LicenseFile))
    89  	}
    90  
    91  	return pd, nil
    92  }
    93  
    94  // isEggRegularFile determines if the specified path is the regular file variant
    95  // of egg metadata (as opposed to a directory that contains more metadata
    96  // files).
    97  func isEggRegularFile(path string) bool {
    98  	return intFile.GlobMatch(eggInfoGlob, path)
    99  }
   100  
   101  // determineSitePackagesRootPath returns the path of the site packages root,
   102  // given the egg metadata file or directory specified in the path.
   103  func determineSitePackagesRootPath(path string) string {
   104  	if isEggRegularFile(path) {
   105  		return filepath.Clean(filepath.Dir(path))
   106  	}
   107  
   108  	return filepath.Clean(filepath.Dir(filepath.Dir(path)))
   109  }
   110  
   111  // handleFieldBodyContinuation returns the updated value for the specified field after processing the specified line.
   112  // If the continuation cannot be processed, it returns an error.
   113  func handleFieldBodyContinuation(key, line string, fields map[string]string) (string, error) {
   114  	if len(key) == 0 {
   115  		return "", fmt.Errorf("no match for continuation: line: '%s'", line)
   116  	}
   117  
   118  	val, ok := fields[key]
   119  	if !ok {
   120  		return "", fmt.Errorf("no previous key exists, expecting: %s", key)
   121  	}
   122  
   123  	// concatenate onto previous value
   124  	return fmt.Sprintf("%s\n %s", val, strings.TrimSpace(line)), nil
   125  }