github.com/noqcks/syft@v0.0.0-20230920222752-a9e2c4e288e5/syft/pkg/cataloger/java/archive_filename.go (about)

     1  package java
     2  
     3  import (
     4  	"path/filepath"
     5  	"regexp"
     6  	"strings"
     7  
     8  	"github.com/anchore/syft/internal/log"
     9  	"github.com/anchore/syft/syft/pkg"
    10  )
    11  
    12  // nameAndVersionPattern finds the package name and version (as named capture
    13  // groups) in a string. The pattern's strategy is to start at the beginning of
    14  // the string, and for every next dash-delimited group, consider the group to be
    15  // a continuation of the package name, unless the group begins with a number or
    16  // matches any of a specified set of "version-indicating" patterns. When a given
    17  // group meets this criterion, consider the group and the remainder of the
    18  // string to be the package version.
    19  //
    20  // Regex components of note:
    21  //
    22  // (?Ui)													...	Sets the "U" and the "i" options for this Regex —— (ungreedy,
    23  // and case-insensitive, respectively). "Ungreedy" is important so that the '*' that trails the package name
    24  // component doesn't consume the rest of the string.
    25  //
    26  // [[:alpha:]][[:word:].]*									...	Matches any word, and the word can include "word" characters (
    27  // which includes numbers and underscores), and periods, but the first character of the word MUST be a letter.
    28  //
    29  // (?:\.[[:alpha:]][[:word:].]*)* 							... This looks redundant, but it's not. It
    30  // extends the previous pattern such that the net effect of both components is
    31  // that words can also include a period and more words (thus, when combined, not
    32  // only is "something" matched, but so is "com.prefix.thing"
    33  //
    34  // (?:\d.*|(?:build\d*.*)|(?:rc?\d+(?:^[[:alpha:]].*)?))	...
    35  // This match group covers the "version-indicating" patterns mentioned in the above description. Given the pipes (
    36  // '|'), this functions as a series of 'OR'-joined conditions:
    37  //
    38  //	\d.*						...	"If it starts with a numeric digit, this is a version, no matter what follows."
    39  //	build\d*.*					...	"If it starts with "build" and then a numeric digit immediately after, this is a version."
    40  //	rc?\d+(?:^[[:alpha:]].*)?	...	"If it starts with "r" or "rc" and then one or more numeric digits immediately
    41  //									after, but no alpha characters right after that (in the same word), this is a version."
    42  //
    43  // Match examples:
    44  //
    45  //	some-package-4.0.1		--> name="some-package", version="4.0.1"
    46  //	prefix.thing-4			-->	name="prefix.thing", version="4"
    47  //	my-http2-server-5		-->	name="my-http2-server", version="5"
    48  //	jetpack-build235-rc5	-->	name="jetpack", version="build2.0-rc5"
    49  //	ironman-r4-2009			--> name="ironman", version="r4-2009"
    50  var nameAndVersionPattern = regexp.MustCompile(`(?Ui)^(?P<name>(?:[[:alpha:]][[:word:].]*(?:\.[[:alpha:]][[:word:].]*)*-?)+)(?:-(?P<version>(\d.*|(build\d+.*)|(rc?\d+(?:^[[:alpha:]].*)?))))?$`)
    51  var secondaryVersionPattern = regexp.MustCompile(`(?:[._-](?P<version>(\d.*|(build\d+.*)|(rc?\d+(?:^[[:alpha:]].*)?))))?$`)
    52  
    53  type archiveFilename struct {
    54  	raw     string
    55  	name    string
    56  	version string
    57  }
    58  
    59  func getSubexp(matches []string, subexpName string, re *regexp.Regexp, raw string) string {
    60  	if len(matches) < 1 {
    61  		log.Warnf("unexpectedly empty matches for archive '%s'", raw)
    62  		return ""
    63  	}
    64  
    65  	index := re.SubexpIndex(subexpName)
    66  	if index < 1 {
    67  		log.Warnf("unexpected index of '%s' capture group for Java archive '%s'", subexpName, raw)
    68  		return ""
    69  	}
    70  
    71  	// Prevent out-of-range panic
    72  	if len(matches) < index+1 {
    73  		log.Warnf("no match found for '%s' in '%s'", subexpName, matches[0])
    74  		return ""
    75  	}
    76  
    77  	return matches[index]
    78  }
    79  
    80  func newJavaArchiveFilename(raw string) archiveFilename {
    81  	// trim the file extension and remove any path prefixes
    82  	cleanedFileName := strings.TrimSuffix(filepath.Base(raw), filepath.Ext(raw))
    83  
    84  	matches := nameAndVersionPattern.FindStringSubmatch(cleanedFileName)
    85  
    86  	name := getSubexp(matches, "name", nameAndVersionPattern, raw)
    87  	version := getSubexp(matches, "version", nameAndVersionPattern, raw)
    88  
    89  	// some jars get named with different conventions, like `_<version>` or `.<version>`
    90  	if version == "" {
    91  		matches = secondaryVersionPattern.FindStringSubmatch(name)
    92  		version = getSubexp(matches, "version", secondaryVersionPattern, raw)
    93  		if version != "" {
    94  			name = name[0 : len(name)-len(version)-1]
    95  		}
    96  	}
    97  
    98  	return archiveFilename{
    99  		raw:     raw,
   100  		name:    name,
   101  		version: version,
   102  	}
   103  }
   104  
   105  func (a archiveFilename) extension() string {
   106  	return strings.TrimPrefix(filepath.Ext(a.raw), ".")
   107  }
   108  
   109  func (a archiveFilename) pkgType() pkg.Type {
   110  	switch strings.ToLower(a.extension()) {
   111  	case "jar", "war", "ear", "lpkg", "par", "sar", "nar":
   112  		return pkg.JavaPkg
   113  	case "jpi", "hpi":
   114  		return pkg.JenkinsPluginPkg
   115  	default:
   116  		return pkg.UnknownPkg
   117  	}
   118  }