github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/language/java/archive/manifest.go (about) 1 // Copyright 2025 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package archive 16 17 import ( 18 "archive/zip" 19 "bufio" 20 "errors" 21 "fmt" 22 "io" 23 "net/textproto" 24 "regexp" 25 "strings" 26 27 "github.com/google/osv-scalibr/extractor/filesystem/language/java/groupid" 28 "github.com/google/osv-scalibr/log" 29 ) 30 31 var ( 32 nameToArtifactID = map[string]string{ 33 "org/apache/axis": "axis", 34 } 35 ) 36 37 // manifest for identifying Maven package. 38 type manifest struct { 39 GroupID string 40 ArtifactID string 41 Version string 42 } 43 44 // valid returns true if mf is a valid manifest property. 45 func (mf manifest) valid() bool { 46 return mf.GroupID != "" && mf.ArtifactID != "" && mf.Version != "" 47 } 48 49 func parseManifest(f *zip.File) (manifest, error) { 50 file, err := f.Open() 51 if err != nil { 52 return manifest{}, fmt.Errorf("failed to open file %q: %w", f.Name, err) 53 } 54 defer file.Close() 55 56 log.Debugf("Parsing manifest file %s\n", f.Name) 57 58 rd := textproto.NewReader((bufio.NewReader(NewOmitEmptyLinesReader(file)))) 59 h, err := rd.ReadMIMEHeader() 60 // MIME header require \n\n in the end, while MANIFEST.mf might not have this. Headers before are 61 // parsed correctly anyway, so skip the error and continue. 62 if err != nil && !errors.Is(err, io.EOF) { 63 return manifest{}, fmt.Errorf("failed to read MIME header: %w", err) 64 } 65 66 artifactID := getArtifactID(h) 67 groupID := getGroupID(h) 68 // Some known packages have incorrect group IDs. Check if this is the case and update the group ID. 69 if newGroupID := groupid.FromArtifactID(artifactID); newGroupID != "" { 70 groupID = newGroupID 71 } 72 73 return manifest{ 74 GroupID: groupID, 75 ArtifactID: artifactID, 76 Version: getVersion(h), 77 }, nil 78 } 79 80 var ( 81 groupIDFinder = regexp.MustCompile(`[a-zA-Z0-9-_.]+`) 82 ) 83 84 // Transforms for manifest fields that need a little work to extract the group 85 // ID. Note that we intentionally do not combine this as part of the `keys` in 86 // the `getGroupID` function because we want to maintain ordering of the keys to 87 // preserve priority of fields. 88 var groupIDTransforms = map[string]func(string) string{ 89 90 // The `Implementation-Title` field can have the group ID as the first part of 91 // the value, with other info appended to it. To extract it, we simply try to 92 // pull out the first domain-like string in the value. 93 // 94 // For example, elasticsearch-8.14.3.jar has a manifest with the following: 95 // 96 // Implementation-Title: org.elasticsearch#server;8.14.3 97 // 98 // And we simply want to extract `org.elasticsearch`, which would be the first 99 // match for the regex. 100 "Implementation-Title": func(s string) string { 101 // Get the first match for a domain-like string. 102 return groupIDFinder.FindString(s) 103 }, 104 } 105 106 func getGroupID(h textproto.MIMEHeader) string { 107 keys := []string{ 108 "Bundle-SymbolicName", 109 "Extension-Name", 110 "Specification-Vendor", 111 "Implementation-Vendor", 112 "Implementation-Vendor-Id", 113 "Implementation-Title", 114 "Bundle-Activator", 115 "Automatic-Module-Name", 116 "Main-Class", 117 "Package", 118 } 119 log.Debug("Potential group ids:") 120 for _, k := range keys { 121 log.Debugf(" %s: %s\n", k, h.Get(k)) 122 } 123 124 g := getFirstValidGroupID(h, keys) 125 if strings.Contains(g, ";") { 126 g = strings.Split(g, ";")[0] 127 } 128 return g 129 } 130 131 func getFirstValidGroupID(h textproto.MIMEHeader, names []string) string { 132 for _, n := range names { 133 groupID := h.Get(n) 134 if transform, ok := groupIDTransforms[n]; ok { 135 groupID = transform(groupID) 136 } 137 if validGroupID(groupID) { 138 return strings.ToLower(groupID) 139 } 140 } 141 return "" 142 } 143 144 func validGroupID(name string) bool { 145 return name != "" && !strings.Contains(name, " ") 146 } 147 148 func getArtifactID(h textproto.MIMEHeader) string { 149 id := getArtifactIDForBundlePlugin(h) 150 if id != "" { 151 return id 152 } 153 154 id = getKnownArtifactIDFromName(h) 155 if id != "" { 156 return id 157 } 158 159 keys := []string{ 160 "Name", 161 "Implementation-Title", 162 "Specification-Title", 163 "Bundle-Name", 164 "Short-Name", 165 "Extension-Name", 166 } 167 log.Debug("Potential artifact ids:") 168 for _, k := range keys { 169 log.Debugf(" %s: %s\n", k, h.Get(k)) 170 } 171 return getFirstValidArtifactID(h, keys) 172 } 173 174 func getVersion(h textproto.MIMEHeader) string { 175 keys := []string{ 176 "Implementation-Version", 177 "Specification-Version", 178 "Plugin-Version", 179 "Bundle-Version", 180 } 181 log.Debug("Potential version:") 182 for _, k := range keys { 183 log.Debugf(" %s: %s\n", k, h.Get(k)) 184 } 185 186 // Some versions contain extra information like the build number or date. 187 // For example "1.4 1855 April 22 2006" 188 // We only want the first part. 189 version := getFirst(h, keys) 190 version = strings.Split(version, " ")[0] 191 192 return version 193 } 194 195 func getFirst(h textproto.MIMEHeader, names []string) string { 196 for _, n := range names { 197 if h.Get(n) != "" { 198 return h.Get(n) 199 } 200 } 201 return "" 202 } 203 204 // getArtifactIDForBundlePlugin returns the package name for an Apache Maven Bundle Plugin. 205 // 206 // For these plugins, the package name is the last part of `Bundle-SymbolicName`. 207 // For example, the package for `Bundle-SymbolicName: com.google.guava.failureaccess` is `failureaccess` 208 // https://svn.apache.org/repos/asf/felix/releases/maven-bundle-plugin-1.2.0/doc/maven-bundle-plugin-bnd.html 209 // https://felix.apache.org/documentation/subprojects/apache-felix-maven-bundle-plugin-bnd.html 210 func getArtifactIDForBundlePlugin(h textproto.MIMEHeader) string { 211 if h.Get("Created-By") != "Apache Maven Bundle Plugin" { 212 return "" 213 } 214 symbolicName := h.Get("Bundle-SymbolicName") 215 if symbolicName == "" { 216 return "" 217 } 218 parts := strings.Split(symbolicName, ".") 219 220 artifactID := parts[len(parts)-1] 221 if validArtifactID(artifactID) { 222 return artifactID 223 } 224 225 return "" 226 } 227 228 // getKnownArtifactIDFromName returns the artifact ID known packages that have an incorrect artifact ID. 229 // 230 // For example, the Apache Axis package has the following Name in it's manifest: 231 // 232 // Name: org/apache/axis 233 // 234 // But the correct artifact ID is `axis`. 235 func getKnownArtifactIDFromName(h textproto.MIMEHeader) string { 236 return nameToArtifactID[h.Get("Name")] 237 } 238 239 func getFirstValidArtifactID(h textproto.MIMEHeader, names []string) string { 240 for _, n := range names { 241 if validArtifactID(h.Get(n)) { 242 return h.Get(n) 243 } 244 } 245 return "" 246 } 247 248 func validArtifactID(name string) bool { 249 if name == "" || strings.Contains(name, " ") { 250 return false 251 } 252 253 // e.g. "${org.eclipse.jdt.annotation.bundleName}" 254 // b/298196886#comment9 255 if strings.HasPrefix(name, "$") { 256 return false 257 } 258 259 // e.g. "%pluginName" 260 // b/298196886#comment10 261 if strings.HasPrefix(name, "%") { 262 return false 263 } 264 265 return true 266 } 267 268 // NewOmitEmptyLinesReader returns a new reader that omits empty lines from the input reader. 269 func NewOmitEmptyLinesReader(r io.Reader) io.Reader { 270 pr, pw := io.Pipe() 271 272 go func() { 273 defer pw.Close() 274 scanner := bufio.NewScanner(r) 275 for scanner.Scan() { 276 line := scanner.Text() 277 if line != "" { 278 _, _ = pw.Write([]byte(line + "\n")) 279 } 280 } 281 if err := scanner.Err(); err != nil { 282 _ = pw.CloseWithError(err) 283 } 284 }() 285 286 return pr 287 }