github.com/noqcks/syft@v0.0.0-20230920222752-a9e2c4e288e5/syft/pkg/cataloger/java/parse_pom_xml.go (about) 1 package java 2 3 import ( 4 "bytes" 5 "encoding/xml" 6 "fmt" 7 "io" 8 "reflect" 9 "regexp" 10 "strings" 11 12 "github.com/saintfish/chardet" 13 "github.com/vifraa/gopom" 14 "golang.org/x/net/html/charset" 15 16 "github.com/anchore/syft/internal/log" 17 "github.com/anchore/syft/syft/artifact" 18 "github.com/anchore/syft/syft/file" 19 "github.com/anchore/syft/syft/pkg" 20 "github.com/anchore/syft/syft/pkg/cataloger/generic" 21 ) 22 23 const pomXMLGlob = "*pom.xml" 24 25 var propertyMatcher = regexp.MustCompile("[$][{][^}]+[}]") 26 27 func parserPomXML(_ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { 28 pom, err := decodePomXML(reader) 29 if err != nil { 30 return nil, nil, err 31 } 32 33 var pkgs []pkg.Package 34 if pom.Dependencies != nil { 35 for _, dep := range *pom.Dependencies { 36 p := newPackageFromPom( 37 pom, 38 dep, 39 reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), 40 ) 41 if p.Name == "" { 42 continue 43 } 44 45 pkgs = append(pkgs, p) 46 } 47 } 48 49 return pkgs, nil, nil 50 } 51 52 func parsePomXMLProject(path string, reader io.Reader) (*pkg.PomProject, error) { 53 project, err := decodePomXML(reader) 54 if err != nil { 55 return nil, err 56 } 57 return newPomProject(path, project), nil 58 } 59 60 func newPomProject(path string, p gopom.Project) *pkg.PomProject { 61 artifactID := safeString(p.ArtifactID) 62 name := safeString(p.Name) 63 projectURL := safeString(p.URL) 64 log.WithFields("path", path, "artifactID", artifactID, "name", name, "projectURL", projectURL).Trace("parsing pom.xml") 65 return &pkg.PomProject{ 66 Path: path, 67 Parent: pomParent(p, p.Parent), 68 GroupID: resolveProperty(p, p.GroupID, "groupId"), 69 ArtifactID: artifactID, 70 Version: resolveProperty(p, p.Version, "version"), 71 Name: name, 72 Description: cleanDescription(p.Description), 73 URL: projectURL, 74 } 75 } 76 77 func newPackageFromPom(pom gopom.Project, dep gopom.Dependency, locations ...file.Location) pkg.Package { 78 m := pkg.JavaMetadata{ 79 PomProperties: &pkg.PomProperties{ 80 GroupID: resolveProperty(pom, dep.GroupID, "groupId"), 81 ArtifactID: resolveProperty(pom, dep.ArtifactID, "artifactId"), 82 Scope: resolveProperty(pom, dep.Scope, "scope"), 83 }, 84 } 85 86 name := safeString(dep.ArtifactID) 87 version := resolveProperty(pom, dep.Version, "version") 88 89 p := pkg.Package{ 90 Name: name, 91 Version: version, 92 Locations: file.NewLocationSet(locations...), 93 PURL: packageURL(name, version, m), 94 Language: pkg.Java, 95 Type: pkg.JavaPkg, // TODO: should we differentiate between packages from jar/war/zip versus packages from a pom.xml that were not installed yet? 96 MetadataType: pkg.JavaMetadataType, 97 Metadata: m, 98 } 99 100 p.SetID() 101 102 return p 103 } 104 105 func decodePomXML(content io.Reader) (project gopom.Project, err error) { 106 inputReader, err := getUtf8Reader(content) 107 if err != nil { 108 return project, fmt.Errorf("unable to read pom.xml: %w", err) 109 } 110 111 decoder := xml.NewDecoder(inputReader) 112 // when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder 113 decoder.CharsetReader = charset.NewReaderLabel 114 115 if err := decoder.Decode(&project); err != nil { 116 return project, fmt.Errorf("unable to unmarshal pom.xml: %w", err) 117 } 118 119 return project, nil 120 } 121 122 func getUtf8Reader(content io.Reader) (io.Reader, error) { 123 pomContents, err := io.ReadAll(content) 124 if err != nil { 125 return nil, err 126 } 127 128 detector := chardet.NewTextDetector() 129 detection, err := detector.DetectBest(pomContents) 130 131 var inputReader io.Reader 132 if err == nil && detection != nil { 133 if detection.Charset == "UTF-8" { 134 inputReader = bytes.NewReader(pomContents) 135 } else { 136 inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents)) 137 if err != nil { 138 return nil, fmt.Errorf("unable to get encoding: %w", err) 139 } 140 } 141 } else { 142 // we could not detect the encoding, but we want a valid file to read. Replace unreadable 143 // characters with the UTF-8 replacement character. 144 inputReader = strings.NewReader(strings.ToValidUTF8(string(pomContents), "�")) 145 } 146 return inputReader, nil 147 } 148 149 func pomParent(pom gopom.Project, parent *gopom.Parent) (result *pkg.PomParent) { 150 if parent == nil { 151 return nil 152 } 153 154 artifactID := safeString(parent.ArtifactID) 155 result = &pkg.PomParent{ 156 GroupID: resolveProperty(pom, parent.GroupID, "groupId"), 157 ArtifactID: artifactID, 158 Version: resolveProperty(pom, parent.Version, "version"), 159 } 160 161 if result.GroupID == "" && result.ArtifactID == "" && result.Version == "" { 162 return nil 163 } 164 return result 165 } 166 167 func cleanDescription(original *string) (cleaned string) { 168 if original == nil { 169 return "" 170 } 171 descriptionLines := strings.Split(*original, "\n") 172 for _, line := range descriptionLines { 173 line = strings.TrimSpace(line) 174 if len(line) == 0 { 175 continue 176 } 177 cleaned += line + " " 178 } 179 return strings.TrimSpace(cleaned) 180 } 181 182 // resolveProperty emulates some maven property resolution logic by looking in the project's variables 183 // as well as supporting the project expressions like ${project.parent.groupId}. 184 // If no match is found, the entire expression including ${} is returned 185 // 186 //nolint:gocognit 187 func resolveProperty(pom gopom.Project, property *string, propertyName string) string { 188 propertyCase := safeString(property) 189 log.WithFields("existingPropertyValue", propertyCase, "propertyName", propertyName).Trace("resolving property") 190 return propertyMatcher.ReplaceAllStringFunc(propertyCase, func(match string) string { 191 propertyName := strings.TrimSpace(match[2 : len(match)-1]) // remove leading ${ and trailing } 192 entries := pomProperties(pom) 193 if value, ok := entries[propertyName]; ok { 194 return value 195 } 196 197 // if we don't find anything directly in the pom properties, 198 // see if we have a project.x expression and process this based 199 // on the xml tags in gopom 200 parts := strings.Split(propertyName, ".") 201 numParts := len(parts) 202 if numParts > 1 && strings.TrimSpace(parts[0]) == "project" { 203 pomValue := reflect.ValueOf(pom) 204 pomValueType := pomValue.Type() 205 for partNum := 1; partNum < numParts; partNum++ { 206 if pomValueType.Kind() != reflect.Struct { 207 break 208 } 209 part := parts[partNum] 210 for fieldNum := 0; fieldNum < pomValueType.NumField(); fieldNum++ { 211 f := pomValueType.Field(fieldNum) 212 tag := f.Tag.Get("xml") 213 tag = strings.Split(tag, ",")[0] 214 // a segment of the property name matches the xml tag for the field, 215 // so we need to recurse down the nested structs or return a match 216 // if we're done. 217 if part == tag { 218 pomValue = pomValue.Field(fieldNum) 219 pomValueType = pomValue.Type() 220 if pomValueType.Kind() == reflect.Ptr { 221 // we were recursing down the nested structs, but one of the steps 222 // we need to take is a nil pointer, so give up and return the original match 223 if pomValue.IsNil() { 224 return match 225 } 226 pomValue = pomValue.Elem() 227 if !pomValue.IsZero() { 228 // we found a non-zero value whose tag matches this part of the property name 229 pomValueType = pomValue.Type() 230 } 231 } 232 // If this was the last part of the property name, return the value 233 if partNum == numParts-1 { 234 return fmt.Sprintf("%v", pomValue.Interface()) 235 } 236 break 237 } 238 } 239 } 240 } 241 return match 242 }) 243 } 244 245 func pomProperties(p gopom.Project) map[string]string { 246 if p.Properties != nil { 247 return p.Properties.Entries 248 } 249 return map[string]string{} 250 } 251 252 func safeString(s *string) string { 253 if s == nil { 254 return "" 255 } 256 return *s 257 }