github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/python/parse_wheel_egg_metadata.go (about)

     1  package python
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"path/filepath"
     7  	"strings"
     8  
     9  	"github.com/bmatcuk/doublestar/v4"
    10  	"github.com/go-viper/mapstructure/v2"
    11  
    12  	"github.com/anchore/syft/internal/log"
    13  	"github.com/anchore/syft/syft/file"
    14  	"github.com/anchore/syft/syft/pkg"
    15  )
    16  
    17  type parsedData struct {
    18  	// core info
    19  
    20  	// DistInfoLocation is the location of the METADATA file within the .dist-info directory where we obtained the python package information
    21  	DistInfoLocation  file.Location
    22  	pkg.PythonPackage `mapstructure:",squash"`
    23  
    24  	// license info
    25  
    26  	Licenses          string `mapstructure:"License"`
    27  	LicenseFile       string `mapstructure:"LicenseFile"`
    28  	LicenseExpression string `mapstructure:"LicenseExpression"`
    29  	LicenseFilePath   string
    30  }
    31  
    32  var pluralFields = map[string]bool{
    33  	"ProvidesExtra": true,
    34  	"RequiresDist":  true,
    35  }
    36  
    37  // parseWheelOrEggMetadata takes a Python Egg or Wheel (which share the same format and values for our purposes),
    38  // returning all Python packages listed.
    39  func parseWheelOrEggMetadata(locationReader file.LocationReadCloser) (parsedData, error) {
    40  	fields, err := extractRFC5322Fields(locationReader)
    41  	if err != nil {
    42  		return parsedData{}, fmt.Errorf("unable to extract python wheel/egg metadata: %w", err)
    43  	}
    44  
    45  	var pd parsedData
    46  	if err := mapstructure.Decode(fields, &pd); err != nil {
    47  		return pd, fmt.Errorf("unable to translate python wheel/egg metadata: %w", err)
    48  	}
    49  
    50  	// add additional metadata not stored in the egg/wheel metadata file
    51  	path := locationReader.Path()
    52  
    53  	pd.SitePackagesRootPath = determineSitePackagesRootPath(path)
    54  	if pd.Licenses != "" || pd.LicenseExpression != "" {
    55  		pd.LicenseFilePath = path
    56  	} else if pd.LicenseFile != "" {
    57  		pd.LicenseFilePath = filepath.Join(filepath.Dir(path), pd.LicenseFile)
    58  	}
    59  
    60  	pd.DistInfoLocation = locationReader.Location
    61  
    62  	return pd, nil
    63  }
    64  
    65  func extractRFC5322Fields(locationReader file.LocationReadCloser) (map[string]any, error) {
    66  	fields := make(map[string]any)
    67  	var key string
    68  
    69  	// though this spec is governed by RFC 5322 (mail message), the metadata files are not guaranteed to be compliant.
    70  	// We must survive parsing as much info as possible without failing and dropping the data.
    71  	scanner := bufio.NewScanner(locationReader)
    72  	for scanner.Scan() {
    73  		line := scanner.Text()
    74  		line = strings.TrimRight(line, "\n")
    75  
    76  		// An empty line means we are done parsing (either because there's no more data,
    77  		// or because a description follows as specified in
    78  		// https://packaging.python.org/specifications/core-metadata/#description;
    79  		// and at this time, we're not interested in the description).
    80  		if len(line) == 0 {
    81  			if len(fields) > 0 {
    82  				break
    83  			}
    84  
    85  			// however, if the field parsing has not started yet, keep scanning lines
    86  			continue
    87  		}
    88  
    89  		switch {
    90  		case strings.HasPrefix(line, " "):
    91  			// a field-body continuation
    92  			updatedValue, err := handleFieldBodyContinuation(key, line, fields)
    93  			if err != nil {
    94  				return nil, err
    95  			}
    96  
    97  			fields[key] = updatedValue
    98  		default:
    99  			// parse a new key (note, duplicate keys that are for singular fields are overridden, where as plural fields are appended)
   100  			if i := strings.Index(line, ":"); i > 0 {
   101  				// mapstruct cannot map keys with dashes, and we are expected to persist the "Author-email" field
   102  				key = strings.ReplaceAll(strings.TrimSpace(line[0:i]), "-", "")
   103  				val := getFieldType(key, strings.TrimSpace(line[i+1:]))
   104  
   105  				fields[key] = handleSingleOrMultiField(fields[key], val)
   106  			} else {
   107  				log.Debugf("cannot parse field from path: %q from line: %q", locationReader.Path(), line)
   108  			}
   109  		}
   110  	}
   111  	return fields, nil
   112  }
   113  
   114  func handleSingleOrMultiField(existingValue, val any) any {
   115  	strSlice, ok := val.([]string)
   116  	if !ok {
   117  		return val
   118  	}
   119  	if existingValue == nil {
   120  		return strSlice
   121  	}
   122  
   123  	switch existingValueTy := existingValue.(type) {
   124  	case []string:
   125  		return append(existingValueTy, strSlice...)
   126  	case string:
   127  		return append([]string{existingValueTy}, strSlice...)
   128  	}
   129  
   130  	return append([]string{fmt.Sprintf("%s", existingValue)}, strSlice...)
   131  }
   132  
   133  func getFieldType(key, in string) any {
   134  	if plural, ok := pluralFields[key]; ok && plural {
   135  		return []string{in}
   136  	}
   137  	return in
   138  }
   139  
   140  // isEggRegularFile determines if the specified path is the regular file variant
   141  // of egg metadata (as opposed to a directory that contains more metadata
   142  // files).
   143  func isEggRegularFile(path string) bool {
   144  	return doublestar.MatchUnvalidated(eggInfoGlob, path)
   145  }
   146  
   147  // determineSitePackagesRootPath returns the path of the site packages root,
   148  // given the egg metadata file or directory specified in the path.
   149  func determineSitePackagesRootPath(path string) string {
   150  	if isEggRegularFile(path) {
   151  		return filepath.Clean(filepath.Dir(path))
   152  	}
   153  
   154  	return filepath.Clean(filepath.Dir(filepath.Dir(path)))
   155  }
   156  
   157  // handleFieldBodyContinuation returns the updated value for the specified field after processing the specified line.
   158  // If the continuation cannot be processed, it returns an error.
   159  func handleFieldBodyContinuation(key, line string, fields map[string]any) (any, error) {
   160  	if len(key) == 0 {
   161  		return "", fmt.Errorf("no match for continuation: line: '%s'", line)
   162  	}
   163  
   164  	val, ok := fields[key]
   165  	if !ok {
   166  		return "", fmt.Errorf("no previous key exists, expecting: %s", key)
   167  	}
   168  
   169  	// concatenate onto previous value
   170  	switch s := val.(type) {
   171  	case string:
   172  		return fmt.Sprintf("%s\n %s", s, strings.TrimSpace(line)), nil
   173  	case []string:
   174  		if len(s) == 0 {
   175  			s = append(s, "")
   176  		}
   177  		s[len(s)-1] = fmt.Sprintf("%s\n %s", s[len(s)-1], strings.TrimSpace(line))
   178  		return s, nil
   179  	default:
   180  		return "", fmt.Errorf("unexpected type for continuation: %T", val)
   181  	}
   182  }