github.com/anchore/syft@v1.4.2-0.20240516191711-1bec1fc5d397/syft/pkg/cataloger/java/parse_pom_xml.go (about)

     1  package java
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/xml"
     7  	"fmt"
     8  	"io"
     9  	"reflect"
    10  	"regexp"
    11  	"strings"
    12  
    13  	"github.com/saintfish/chardet"
    14  	"github.com/vifraa/gopom"
    15  	"golang.org/x/net/html/charset"
    16  
    17  	"github.com/anchore/syft/internal/log"
    18  	"github.com/anchore/syft/syft/artifact"
    19  	"github.com/anchore/syft/syft/file"
    20  	"github.com/anchore/syft/syft/pkg"
    21  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    22  )
    23  
// pomXMLGlob is the glob pattern for pom.xml files: any name that ends in "pom.xml".
const pomXMLGlob = "*pom.xml"

// propertyMatcher matches property expressions of the form ${...}, i.e. "${", one or more
// characters other than "}", and a closing "}".
var propertyMatcher = regexp.MustCompile("[$][{][^}]+[}]")
    27  
    28  func (gap genericArchiveParserAdapter) parserPomXML(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    29  	pom, err := decodePomXML(reader)
    30  	if err != nil {
    31  		return nil, nil, err
    32  	}
    33  
    34  	var pkgs []pkg.Package
    35  	if pom.Dependencies != nil {
    36  		for _, dep := range *pom.Dependencies {
    37  			p := newPackageFromPom(
    38  				ctx,
    39  				pom,
    40  				dep,
    41  				gap.cfg,
    42  				reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
    43  			)
    44  			if p.Name == "" {
    45  				continue
    46  			}
    47  
    48  			pkgs = append(pkgs, p)
    49  		}
    50  	}
    51  
    52  	return pkgs, nil, nil
    53  }
    54  
    55  func parsePomXMLProject(path string, reader io.Reader, location file.Location) (*parsedPomProject, error) {
    56  	project, err := decodePomXML(reader)
    57  	if err != nil {
    58  		return nil, err
    59  	}
    60  	return newPomProject(path, project, location), nil
    61  }
    62  
    63  func newPomProject(path string, p gopom.Project, location file.Location) *parsedPomProject {
    64  	artifactID := safeString(p.ArtifactID)
    65  	name := safeString(p.Name)
    66  	projectURL := safeString(p.URL)
    67  
    68  	var licenses []pkg.License
    69  	if p.Licenses != nil {
    70  		for _, license := range *p.Licenses {
    71  			var licenseName, licenseURL string
    72  			if license.Name != nil {
    73  				licenseName = *license.Name
    74  			}
    75  			if license.URL != nil {
    76  				licenseURL = *license.URL
    77  			}
    78  
    79  			if licenseName == "" && licenseURL == "" {
    80  				continue
    81  			}
    82  
    83  			licenses = append(licenses, pkg.NewLicenseFromFields(licenseName, licenseURL, &location))
    84  		}
    85  	}
    86  
    87  	log.WithFields("path", path, "artifactID", artifactID, "name", name, "projectURL", projectURL).Trace("parsing pom.xml")
    88  	return &parsedPomProject{
    89  		JavaPomProject: &pkg.JavaPomProject{
    90  			Path:        path,
    91  			Parent:      pomParent(p, p.Parent),
    92  			GroupID:     resolveProperty(p, p.GroupID, "groupId"),
    93  			ArtifactID:  artifactID,
    94  			Version:     resolveProperty(p, p.Version, "version"),
    95  			Name:        name,
    96  			Description: cleanDescription(p.Description),
    97  			URL:         projectURL,
    98  		},
    99  		Licenses: licenses,
   100  	}
   101  }
   102  
   103  func newPackageFromPom(ctx context.Context, pom gopom.Project, dep gopom.Dependency, cfg ArchiveCatalogerConfig, locations ...file.Location) pkg.Package {
   104  	m := pkg.JavaArchive{
   105  		PomProperties: &pkg.JavaPomProperties{
   106  			GroupID:    resolveProperty(pom, dep.GroupID, "groupId"),
   107  			ArtifactID: resolveProperty(pom, dep.ArtifactID, "artifactId"),
   108  			Scope:      resolveProperty(pom, dep.Scope, "scope"),
   109  		},
   110  	}
   111  
   112  	name := safeString(dep.ArtifactID)
   113  	version := resolveProperty(pom, dep.Version, "version")
   114  
   115  	licenses := make([]pkg.License, 0)
   116  	if cfg.UseNetwork {
   117  		if version == "" {
   118  			// If we have no version then let's try to get it from a parent pom DependencyManagement section
   119  			version = recursivelyFindVersionFromParentPom(ctx, *dep.GroupID, *dep.ArtifactID, *pom.Parent.GroupID, *pom.Parent.ArtifactID, *pom.Parent.Version, cfg)
   120  		}
   121  		if version != "" {
   122  			parentLicenses := recursivelyFindLicensesFromParentPom(
   123  				ctx,
   124  				m.PomProperties.GroupID,
   125  				m.PomProperties.ArtifactID,
   126  				version,
   127  				cfg)
   128  
   129  			if len(parentLicenses) > 0 {
   130  				for _, licenseName := range parentLicenses {
   131  					licenses = append(licenses, pkg.NewLicenseFromFields(licenseName, "", nil))
   132  				}
   133  			}
   134  		}
   135  	}
   136  
   137  	p := pkg.Package{
   138  		Name:      name,
   139  		Version:   version,
   140  		Locations: file.NewLocationSet(locations...),
   141  		Licenses:  pkg.NewLicenseSet(licenses...),
   142  		PURL:      packageURL(name, version, m),
   143  		Language:  pkg.Java,
   144  		Type:      pkg.JavaPkg, // TODO: should we differentiate between packages from jar/war/zip versus packages from a pom.xml that were not installed yet?
   145  		Metadata:  m,
   146  	}
   147  
   148  	p.SetID()
   149  
   150  	return p
   151  }
   152  
   153  func decodePomXML(content io.Reader) (project gopom.Project, err error) {
   154  	inputReader, err := getUtf8Reader(content)
   155  	if err != nil {
   156  		return project, fmt.Errorf("unable to read pom.xml: %w", err)
   157  	}
   158  
   159  	decoder := xml.NewDecoder(inputReader)
   160  	// when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder
   161  	decoder.CharsetReader = charset.NewReaderLabel
   162  
   163  	if err := decoder.Decode(&project); err != nil {
   164  		return project, fmt.Errorf("unable to unmarshal pom.xml: %w", err)
   165  	}
   166  
   167  	return project, nil
   168  }
   169  
   170  func getUtf8Reader(content io.Reader) (io.Reader, error) {
   171  	pomContents, err := io.ReadAll(content)
   172  	if err != nil {
   173  		return nil, err
   174  	}
   175  
   176  	detector := chardet.NewTextDetector()
   177  	detection, err := detector.DetectBest(pomContents)
   178  
   179  	var inputReader io.Reader
   180  	if err == nil && detection != nil {
   181  		if detection.Charset == "UTF-8" {
   182  			inputReader = bytes.NewReader(pomContents)
   183  		} else {
   184  			inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents))
   185  			if err != nil {
   186  				return nil, fmt.Errorf("unable to get encoding: %w", err)
   187  			}
   188  		}
   189  	} else {
   190  		// we could not detect the encoding, but we want a valid file to read. Replace unreadable
   191  		// characters with the UTF-8 replacement character.
   192  		inputReader = strings.NewReader(strings.ToValidUTF8(string(pomContents), "�"))
   193  	}
   194  	return inputReader, nil
   195  }
   196  
   197  func pomParent(pom gopom.Project, parent *gopom.Parent) (result *pkg.JavaPomParent) {
   198  	if parent == nil {
   199  		return nil
   200  	}
   201  
   202  	artifactID := safeString(parent.ArtifactID)
   203  	result = &pkg.JavaPomParent{
   204  		GroupID:    resolveProperty(pom, parent.GroupID, "groupId"),
   205  		ArtifactID: artifactID,
   206  		Version:    resolveProperty(pom, parent.Version, "version"),
   207  	}
   208  
   209  	if result.GroupID == "" && result.ArtifactID == "" && result.Version == "" {
   210  		return nil
   211  	}
   212  	return result
   213  }
   214  
   215  func cleanDescription(original *string) (cleaned string) {
   216  	if original == nil {
   217  		return ""
   218  	}
   219  	descriptionLines := strings.Split(*original, "\n")
   220  	for _, line := range descriptionLines {
   221  		line = strings.TrimSpace(line)
   222  		if len(line) == 0 {
   223  			continue
   224  		}
   225  		cleaned += line + " "
   226  	}
   227  	return strings.TrimSpace(cleaned)
   228  }
   229  
   230  // resolveProperty emulates some maven property resolution logic by looking in the project's variables
   231  // as well as supporting the project expressions like ${project.parent.groupId}.
   232  // If no match is found, the entire expression including ${} is returned
   233  func resolveProperty(pom gopom.Project, property *string, propertyName string) string {
   234  	propertyCase := safeString(property)
   235  	log.WithFields("existingPropertyValue", propertyCase, "propertyName", propertyName).Trace("resolving property")
   236  	seenBeforePropertyNames := map[string]struct{}{
   237  		propertyName: {},
   238  	}
   239  	result := recursiveResolveProperty(pom, propertyCase, seenBeforePropertyNames)
   240  	if propertyMatcher.MatchString(result) {
   241  		return "" // dereferencing variable failed; fall back to empty string
   242  	}
   243  	return result
   244  }
   245  
   246  //nolint:gocognit
   247  func recursiveResolveProperty(pom gopom.Project, propertyCase string, seenPropertyNames map[string]struct{}) string {
   248  	return propertyMatcher.ReplaceAllStringFunc(propertyCase, func(match string) string {
   249  		propertyName := strings.TrimSpace(match[2 : len(match)-1]) // remove leading ${ and trailing }
   250  		if _, seen := seenPropertyNames[propertyName]; seen {
   251  			return propertyCase
   252  		}
   253  		entries := pomProperties(pom)
   254  		if value, ok := entries[propertyName]; ok {
   255  			seenPropertyNames[propertyName] = struct{}{}
   256  			return recursiveResolveProperty(pom, value, seenPropertyNames) // recursively resolve in case a variable points to a variable.
   257  		}
   258  
   259  		// if we don't find anything directly in the pom properties,
   260  		// see if we have a project.x expression and process this based
   261  		// on the xml tags in gopom
   262  		parts := strings.Split(propertyName, ".")
   263  		numParts := len(parts)
   264  		if numParts > 1 && strings.TrimSpace(parts[0]) == "project" {
   265  			pomValue := reflect.ValueOf(pom)
   266  			pomValueType := pomValue.Type()
   267  			for partNum := 1; partNum < numParts; partNum++ {
   268  				if pomValueType.Kind() != reflect.Struct {
   269  					break
   270  				}
   271  				part := parts[partNum]
   272  				for fieldNum := 0; fieldNum < pomValueType.NumField(); fieldNum++ {
   273  					f := pomValueType.Field(fieldNum)
   274  					tag := f.Tag.Get("xml")
   275  					tag = strings.Split(tag, ",")[0]
   276  					// a segment of the property name matches the xml tag for the field,
   277  					// so we need to recurse down the nested structs or return a match
   278  					// if we're done.
   279  					if part == tag {
   280  						pomValue = pomValue.Field(fieldNum)
   281  						pomValueType = pomValue.Type()
   282  						if pomValueType.Kind() == reflect.Ptr {
   283  							// we were recursing down the nested structs, but one of the steps
   284  							// we need to take is a nil pointer, so give up and return the original match
   285  							if pomValue.IsNil() {
   286  								return match
   287  							}
   288  							pomValue = pomValue.Elem()
   289  							if !pomValue.IsZero() {
   290  								// we found a non-zero value whose tag matches this part of the property name
   291  								pomValueType = pomValue.Type()
   292  							}
   293  						}
   294  						// If this was the last part of the property name, return the value
   295  						if partNum == numParts-1 {
   296  							return fmt.Sprintf("%v", pomValue.Interface())
   297  						}
   298  						break
   299  					}
   300  				}
   301  			}
   302  		}
   303  		return match
   304  	})
   305  }
   306  
   307  func pomProperties(p gopom.Project) map[string]string {
   308  	if p.Properties != nil {
   309  		return p.Properties.Entries
   310  	}
   311  	return map[string]string{}
   312  }
   313  
   314  func safeString(s *string) string {
   315  	if s == nil {
   316  		return ""
   317  	}
   318  	return *s
   319  }