github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/java/internal/maven/pom_parser.go (about) 1 package maven 2 3 import ( 4 "bytes" 5 "encoding/xml" 6 "fmt" 7 "io" 8 "strings" 9 10 "github.com/saintfish/chardet" 11 "github.com/vifraa/gopom" 12 "golang.org/x/net/html/charset" 13 ) 14 15 type ( 16 Project = gopom.Project 17 Properties = gopom.Properties 18 Parent = gopom.Parent 19 Dependency = gopom.Dependency 20 License = gopom.License 21 ) 22 23 // ParsePomXML decodes a pom XML file, detecting and converting non-UTF-8 charsets. this DOES NOT perform any logic to resolve properties such as groupID, artifactID, and version 24 func ParsePomXML(content io.Reader) (project *Project, err error) { 25 inputReader, err := getUtf8Reader(content) 26 if err != nil { 27 return nil, fmt.Errorf("unable to read pom.xml: %w", err) 28 } 29 30 decoder := xml.NewDecoder(inputReader) 31 // when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder 32 decoder.CharsetReader = charset.NewReaderLabel 33 34 project = &Project{} 35 if err := decoder.Decode(project); err != nil { 36 return nil, fmt.Errorf("unable to unmarshal pom.xml: %w", err) 37 } 38 39 return project, nil 40 } 41 42 func getUtf8Reader(content io.Reader) (io.Reader, error) { 43 pomContents, err := io.ReadAll(content) 44 if err != nil { 45 return nil, err 46 } 47 48 detector := chardet.NewTextDetector() 49 detection, err := detector.DetectBest(pomContents) 50 51 var inputReader io.Reader 52 if err == nil && detection != nil { 53 if detection.Charset == "UTF-8" { 54 inputReader = bytes.NewReader(pomContents) 55 } else { 56 inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents)) 57 if err != nil { 58 return nil, fmt.Errorf("unable to get encoding: %w", err) 59 } 60 } 61 } else { 62 // we could not detect the encoding, but we want a valid file to read. Replace unreadable 63 // characters with the UTF-8 replacement character. 64 inputReader = strings.NewReader(strings.ToValidUTF8(string(pomContents), "�")) 65 } 66 return inputReader, nil 67 }