github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/java/archive/manifest.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package archive
    16  
    17  import (
    18  	"archive/zip"
    19  	"bufio"
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"net/textproto"
    24  	"regexp"
    25  	"strings"
    26  
    27  	"github.com/google/osv-scalibr/extractor/filesystem/language/java/groupid"
    28  	"github.com/google/osv-scalibr/log"
    29  )
    30  
    31  var (
    32  	nameToArtifactID = map[string]string{
    33  		"org/apache/axis": "axis",
    34  	}
    35  )
    36  
// manifest holds the fields used for identifying a Maven package: its group
// ID, artifact ID and version, as derived from a manifest file's headers.
type manifest struct {
	// GroupID is the Maven group ID; empty if none could be determined.
	GroupID string
	// ArtifactID is the Maven artifact ID; empty if none could be determined.
	ArtifactID string
	// Version is the package version; empty if none could be determined.
	Version string
}
    43  
    44  // valid returns true if mf is a valid manifest property.
    45  func (mf manifest) valid() bool {
    46  	return mf.GroupID != "" && mf.ArtifactID != "" && mf.Version != ""
    47  }
    48  
    49  func parseManifest(f *zip.File) (manifest, error) {
    50  	file, err := f.Open()
    51  	if err != nil {
    52  		return manifest{}, fmt.Errorf("failed to open file %q: %w", f.Name, err)
    53  	}
    54  	defer file.Close()
    55  
    56  	log.Debugf("Parsing manifest file %s\n", f.Name)
    57  
    58  	rd := textproto.NewReader((bufio.NewReader(NewOmitEmptyLinesReader(file))))
    59  	h, err := rd.ReadMIMEHeader()
    60  	// MIME header require \n\n in the end, while MANIFEST.mf might not have this. Headers before are
    61  	// parsed correctly anyway, so skip the error and continue.
    62  	if err != nil && !errors.Is(err, io.EOF) {
    63  		return manifest{}, fmt.Errorf("failed to read MIME header: %w", err)
    64  	}
    65  
    66  	artifactID := getArtifactID(h)
    67  	groupID := getGroupID(h)
    68  	// Some known packages have incorrect group IDs. Check if this is the case and update the group ID.
    69  	if newGroupID := groupid.FromArtifactID(artifactID); newGroupID != "" {
    70  		groupID = newGroupID
    71  	}
    72  
    73  	return manifest{
    74  		GroupID:    groupID,
    75  		ArtifactID: artifactID,
    76  		Version:    getVersion(h),
    77  	}, nil
    78  }
    79  
    80  var (
    81  	groupIDFinder = regexp.MustCompile(`[a-zA-Z0-9-_.]+`)
    82  )
    83  
    84  // Transforms for manifest fields that need a little work to extract the group
    85  // ID. Note that we intentionally do not combine this as part of the `keys` in
    86  // the `getGroupID` function because we want to maintain ordering of the keys to
    87  // preserve priority of fields.
    88  var groupIDTransforms = map[string]func(string) string{
    89  
    90  	// The `Implementation-Title` field can have the group ID as the first part of
    91  	// the value, with other info appended to it.  To extract it, we simply try to
    92  	// pull out the first domain-like string in the value.
    93  	//
    94  	// For example, elasticsearch-8.14.3.jar has a manifest with the following:
    95  	//
    96  	//	Implementation-Title: org.elasticsearch#server;8.14.3
    97  	//
    98  	// And we simply want to extract `org.elasticsearch`, which would be the first
    99  	// match for the regex.
   100  	"Implementation-Title": func(s string) string {
   101  		// Get the first match for a domain-like string.
   102  		return groupIDFinder.FindString(s)
   103  	},
   104  }
   105  
   106  func getGroupID(h textproto.MIMEHeader) string {
   107  	keys := []string{
   108  		"Bundle-SymbolicName",
   109  		"Extension-Name",
   110  		"Specification-Vendor",
   111  		"Implementation-Vendor",
   112  		"Implementation-Vendor-Id",
   113  		"Implementation-Title",
   114  		"Bundle-Activator",
   115  		"Automatic-Module-Name",
   116  		"Main-Class",
   117  		"Package",
   118  	}
   119  	log.Debug("Potential group ids:")
   120  	for _, k := range keys {
   121  		log.Debugf("  %s: %s\n", k, h.Get(k))
   122  	}
   123  
   124  	g := getFirstValidGroupID(h, keys)
   125  	if strings.Contains(g, ";") {
   126  		g = strings.Split(g, ";")[0]
   127  	}
   128  	return g
   129  }
   130  
   131  func getFirstValidGroupID(h textproto.MIMEHeader, names []string) string {
   132  	for _, n := range names {
   133  		groupID := h.Get(n)
   134  		if transform, ok := groupIDTransforms[n]; ok {
   135  			groupID = transform(groupID)
   136  		}
   137  		if validGroupID(groupID) {
   138  			return strings.ToLower(groupID)
   139  		}
   140  	}
   141  	return ""
   142  }
   143  
   144  func validGroupID(name string) bool {
   145  	return name != "" && !strings.Contains(name, " ")
   146  }
   147  
   148  func getArtifactID(h textproto.MIMEHeader) string {
   149  	id := getArtifactIDForBundlePlugin(h)
   150  	if id != "" {
   151  		return id
   152  	}
   153  
   154  	id = getKnownArtifactIDFromName(h)
   155  	if id != "" {
   156  		return id
   157  	}
   158  
   159  	keys := []string{
   160  		"Name",
   161  		"Implementation-Title",
   162  		"Specification-Title",
   163  		"Bundle-Name",
   164  		"Short-Name",
   165  		"Extension-Name",
   166  	}
   167  	log.Debug("Potential artifact ids:")
   168  	for _, k := range keys {
   169  		log.Debugf("  %s: %s\n", k, h.Get(k))
   170  	}
   171  	return getFirstValidArtifactID(h, keys)
   172  }
   173  
   174  func getVersion(h textproto.MIMEHeader) string {
   175  	keys := []string{
   176  		"Implementation-Version",
   177  		"Specification-Version",
   178  		"Plugin-Version",
   179  		"Bundle-Version",
   180  	}
   181  	log.Debug("Potential version:")
   182  	for _, k := range keys {
   183  		log.Debugf("  %s: %s\n", k, h.Get(k))
   184  	}
   185  
   186  	// Some versions contain extra information like the build number or date.
   187  	// For example "1.4 1855 April 22 2006"
   188  	// We only want the first part.
   189  	version := getFirst(h, keys)
   190  	version = strings.Split(version, " ")[0]
   191  
   192  	return version
   193  }
   194  
   195  func getFirst(h textproto.MIMEHeader, names []string) string {
   196  	for _, n := range names {
   197  		if h.Get(n) != "" {
   198  			return h.Get(n)
   199  		}
   200  	}
   201  	return ""
   202  }
   203  
   204  // getArtifactIDForBundlePlugin returns the package name for an Apache Maven Bundle Plugin.
   205  //
   206  // For these plugins, the package name is the last part of `Bundle-SymbolicName`.
   207  // For example, the package for `Bundle-SymbolicName: com.google.guava.failureaccess` is `failureaccess`
   208  // https://svn.apache.org/repos/asf/felix/releases/maven-bundle-plugin-1.2.0/doc/maven-bundle-plugin-bnd.html
   209  // https://felix.apache.org/documentation/subprojects/apache-felix-maven-bundle-plugin-bnd.html
   210  func getArtifactIDForBundlePlugin(h textproto.MIMEHeader) string {
   211  	if h.Get("Created-By") != "Apache Maven Bundle Plugin" {
   212  		return ""
   213  	}
   214  	symbolicName := h.Get("Bundle-SymbolicName")
   215  	if symbolicName == "" {
   216  		return ""
   217  	}
   218  	parts := strings.Split(symbolicName, ".")
   219  
   220  	artifactID := parts[len(parts)-1]
   221  	if validArtifactID(artifactID) {
   222  		return artifactID
   223  	}
   224  
   225  	return ""
   226  }
   227  
   228  // getKnownArtifactIDFromName returns the artifact ID known packages that have an incorrect artifact ID.
   229  //
   230  // For example, the Apache Axis package has the following Name in it's manifest:
   231  //
   232  //	Name: org/apache/axis
   233  //
   234  // But the correct artifact ID is `axis`.
   235  func getKnownArtifactIDFromName(h textproto.MIMEHeader) string {
   236  	return nameToArtifactID[h.Get("Name")]
   237  }
   238  
   239  func getFirstValidArtifactID(h textproto.MIMEHeader, names []string) string {
   240  	for _, n := range names {
   241  		if validArtifactID(h.Get(n)) {
   242  			return h.Get(n)
   243  		}
   244  	}
   245  	return ""
   246  }
   247  
   248  func validArtifactID(name string) bool {
   249  	if name == "" || strings.Contains(name, " ") {
   250  		return false
   251  	}
   252  
   253  	// e.g. "${org.eclipse.jdt.annotation.bundleName}"
   254  	// b/298196886#comment9
   255  	if strings.HasPrefix(name, "$") {
   256  		return false
   257  	}
   258  
   259  	// e.g. "%pluginName"
   260  	// b/298196886#comment10
   261  	if strings.HasPrefix(name, "%") {
   262  		return false
   263  	}
   264  
   265  	return true
   266  }
   267  
   268  // NewOmitEmptyLinesReader returns a new reader that omits empty lines from the input reader.
   269  func NewOmitEmptyLinesReader(r io.Reader) io.Reader {
   270  	pr, pw := io.Pipe()
   271  
   272  	go func() {
   273  		defer pw.Close()
   274  		scanner := bufio.NewScanner(r)
   275  		for scanner.Scan() {
   276  			line := scanner.Text()
   277  			if line != "" {
   278  				_, _ = pw.Write([]byte(line + "\n"))
   279  			}
   280  		}
   281  		if err := scanner.Err(); err != nil {
   282  			_ = pw.CloseWithError(err)
   283  		}
   284  	}()
   285  
   286  	return pr
   287  }