github.com/nextlinux/gosbom@v0.81.1-0.20230627115839-1ff50c281391/gosbom/pkg/cataloger/python/parse_requirements.go (about)

     1  package python
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"regexp"
     7  	"strings"
     8  	"unicode"
     9  
    10  	"github.com/nextlinux/gosbom/gosbom/artifact"
    11  	"github.com/nextlinux/gosbom/gosbom/file"
    12  	"github.com/nextlinux/gosbom/gosbom/pkg"
    13  	"github.com/nextlinux/gosbom/gosbom/pkg/cataloger/generic"
    14  	"github.com/nextlinux/gosbom/internal/log"
    15  )
    16  
    17  var _ generic.Parser = parseRequirementsTxt
    18  
    19  var (
    20  	extrasRegex = regexp.MustCompile(`\[.*\]`)
    21  	urlRegex    = regexp.MustCompile("@.*git.*")
    22  )
    23  
    24  // parseRequirementsTxt takes a Python requirements.txt file, returning all Python packages that are locked to a
    25  // specific version.
    26  func parseRequirementsTxt(_ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    27  	var packages []pkg.Package
    28  
    29  	scanner := bufio.NewScanner(reader)
    30  	for scanner.Scan() {
    31  		line := scanner.Text()
    32  		rawLineNoComments := removeTrailingComment(line)
    33  		line = trimRequirementsTxtLine(line)
    34  
    35  		if line == "" {
    36  			// nothing to parse on this line
    37  			continue
    38  		}
    39  
    40  		if strings.HasPrefix(line, "-e") {
    41  			// editable packages aren't parsed (yet)
    42  			continue
    43  		}
    44  
    45  		if !strings.Contains(line, "==") {
    46  			// a package without a version, or a range (unpinned) which does not tell us
    47  			// exactly what will be installed.
    48  			continue
    49  		}
    50  
    51  		// parse a new requirement
    52  		parts := strings.Split(line, "==")
    53  		if len(parts) < 2 {
    54  			// this should never happen, but just in case
    55  			log.WithFields("path", reader.RealPath).Warnf("unable to parse requirements.txt line: %q", line)
    56  			continue
    57  		}
    58  
    59  		// check if the version contains hash declarations on the same line
    60  		version, _ := parseVersionAndHashes(parts[1])
    61  
    62  		name := strings.TrimSpace(parts[0])
    63  		version = strings.TrimFunc(version, func(r rune) bool {
    64  			return !unicode.IsLetter(r) && !unicode.IsNumber(r)
    65  		})
    66  
    67  		// TODO: Update to support more than only ==
    68  		versionConstraint := fmt.Sprintf("== %s", version)
    69  
    70  		if name == "" || version == "" {
    71  			log.WithFields("path", reader.RealPath).Debugf("found empty package in requirements.txt line: %q", line)
    72  			continue
    73  		}
    74  		packages = append(
    75  			packages,
    76  			newPackageForRequirementsWithMetadata(
    77  				name,
    78  				version,
    79  				pkg.PythonRequirementsMetadata{
    80  					Name:              name,
    81  					Extras:            parseExtras(rawLineNoComments),
    82  					VersionConstraint: versionConstraint,
    83  					URL:               parseURL(rawLineNoComments),
    84  					Markers:           parseMarkers(rawLineNoComments),
    85  				},
    86  				reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
    87  			),
    88  		)
    89  	}
    90  
    91  	if err := scanner.Err(); err != nil {
    92  		return nil, nil, fmt.Errorf("failed to parse python requirements file: %w", err)
    93  	}
    94  
    95  	return packages, nil, nil
    96  }
    97  
    98  func parseVersionAndHashes(version string) (string, []string) {
    99  	parts := strings.Split(version, "--hash=")
   100  	if len(parts) < 2 {
   101  		return version, nil
   102  	}
   103  
   104  	return parts[0], parts[1:]
   105  }
   106  
   107  // trimRequirementsTxtLine removes content from the given requirements.txt line
   108  // that should not be considered for parsing.
   109  func trimRequirementsTxtLine(line string) string {
   110  	line = strings.TrimSpace(line)
   111  	line = removeTrailingComment(line)
   112  	line = removeEnvironmentMarkers(line)
   113  	line = checkForRegex(line) // remove extras and url from line if found
   114  
   115  	return line
   116  }
   117  
   118  // removeTrailingComment takes a requirements.txt line and strips off comment strings.
   119  func removeTrailingComment(line string) string {
   120  	parts := strings.SplitN(line, "#", 2)
   121  	if len(parts) < 2 {
   122  		// there aren't any comments
   123  
   124  		return line
   125  	}
   126  
   127  	return parts[0]
   128  }
   129  
   130  // removeEnvironmentMarkers removes any instances of environment markers (delimited by ';') from the line.
   131  // For more information, see https://www.python.org/dev/peps/pep-0508/#environment-markers.
   132  func removeEnvironmentMarkers(line string) string {
   133  	parts := strings.SplitN(line, ";", 2)
   134  	if len(parts) < 2 {
   135  		// there aren't any environment markers
   136  
   137  		return line
   138  	}
   139  
   140  	return parts[0]
   141  }
   142  
   143  func parseExtras(packageName string) []string {
   144  	if extrasRegex.MatchString(packageName) {
   145  		// Remove square brackets
   146  		extras := strings.TrimFunc(extrasRegex.FindString(packageName), func(r rune) bool {
   147  			return !unicode.IsLetter(r) && !unicode.IsNumber(r)
   148  		})
   149  
   150  		// Remove any additional whitespace
   151  		extras = strings.ReplaceAll(extras, " ", "")
   152  
   153  		return strings.Split(extras, ",")
   154  	}
   155  
   156  	return []string{}
   157  }
   158  
   159  func parseMarkers(line string) map[string]string {
   160  	markers := map[string]string{}
   161  	parts := strings.SplitN(line, ";", 2)
   162  
   163  	if len(parts) == 2 {
   164  		splittableMarkers := parts[1]
   165  
   166  		for _, combineString := range []string{" or ", " and "} {
   167  			splittableMarkers = strings.TrimSpace(
   168  				strings.ReplaceAll(splittableMarkers, combineString, ","),
   169  			)
   170  		}
   171  
   172  		splittableMarkers = strings.TrimSpace(splittableMarkers)
   173  
   174  		for _, mark := range strings.Split(splittableMarkers, ",") {
   175  			markparts := strings.Split(mark, " ")
   176  			markers[markparts[0]] = strings.Join(markparts[1:], " ")
   177  		}
   178  	}
   179  
   180  	return markers
   181  }
   182  
   183  func parseURL(line string) string {
   184  	parts := strings.Split(line, "@")
   185  
   186  	if len(parts) > 1 {
   187  		desiredIndex := -1
   188  
   189  		for index, part := range parts {
   190  			part := strings.TrimFunc(part, func(r rune) bool {
   191  				return !unicode.IsLetter(r) && !unicode.IsNumber(r)
   192  			})
   193  
   194  			if strings.HasPrefix(part, "git") {
   195  				desiredIndex = index
   196  				break
   197  			}
   198  		}
   199  
   200  		if desiredIndex != -1 {
   201  			return strings.TrimSpace(strings.Join(parts[desiredIndex:], "@"))
   202  		}
   203  	}
   204  
   205  	return ""
   206  }
   207  
   208  // function to check a string for all possilbe regex expressions, replacing it if found
   209  func checkForRegex(stringToCheck string) string {
   210  	stringToReturn := stringToCheck
   211  
   212  	for _, r := range []*regexp.Regexp{
   213  		urlRegex,
   214  		extrasRegex,
   215  	} {
   216  		if r.MatchString(stringToCheck) {
   217  			stringToReturn = r.ReplaceAllString(stringToCheck, "")
   218  		}
   219  	}
   220  
   221  	return stringToReturn
   222  }