github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/java/internal/maven/pom_parser.go (about)

     1  package maven
     2  
import (
	"bytes"
	"encoding/xml"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/saintfish/chardet"
	"github.com/vifraa/gopom"
	"golang.org/x/net/html/charset"
)
    14  
    15  type (
    16  	Project    = gopom.Project
    17  	Properties = gopom.Properties
    18  	Parent     = gopom.Parent
    19  	Dependency = gopom.Dependency
    20  	License    = gopom.License
    21  )
    22  
    23  // ParsePomXML decodes a pom XML file, detecting and converting non-UTF-8 charsets. this DOES NOT perform any logic to resolve properties such as groupID, artifactID, and version
    24  func ParsePomXML(content io.Reader) (project *Project, err error) {
    25  	inputReader, err := getUtf8Reader(content)
    26  	if err != nil {
    27  		return nil, fmt.Errorf("unable to read pom.xml: %w", err)
    28  	}
    29  
    30  	decoder := xml.NewDecoder(inputReader)
    31  	// when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder
    32  	decoder.CharsetReader = charset.NewReaderLabel
    33  
    34  	project = &Project{}
    35  	if err := decoder.Decode(project); err != nil {
    36  		return nil, fmt.Errorf("unable to unmarshal pom.xml: %w", err)
    37  	}
    38  
    39  	return project, nil
    40  }
    41  
    42  func getUtf8Reader(content io.Reader) (io.Reader, error) {
    43  	pomContents, err := io.ReadAll(content)
    44  	if err != nil {
    45  		return nil, err
    46  	}
    47  
    48  	detector := chardet.NewTextDetector()
    49  	detection, err := detector.DetectBest(pomContents)
    50  
    51  	var inputReader io.Reader
    52  	if err == nil && detection != nil {
    53  		if detection.Charset == "UTF-8" {
    54  			inputReader = bytes.NewReader(pomContents)
    55  		} else {
    56  			inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents))
    57  			if err != nil {
    58  				return nil, fmt.Errorf("unable to get encoding: %w", err)
    59  			}
    60  		}
    61  	} else {
    62  		// we could not detect the encoding, but we want a valid file to read. Replace unreadable
    63  		// characters with the UTF-8 replacement character.
    64  		inputReader = strings.NewReader(strings.ToValidUTF8(string(pomContents), "�"))
    65  	}
    66  	return inputReader, nil
    67  }