github.com/quay/claircore@v1.5.28/java/packagescanner.go (about) 1 // Package java contains components for interrogating java packages in 2 // container layers. 3 package java 4 5 import ( 6 "archive/zip" 7 "bytes" 8 "context" 9 "crypto/sha1" 10 "encoding/json" 11 "errors" 12 "fmt" 13 "io" 14 "net/http" 15 "net/url" 16 "runtime/trace" 17 "sort" 18 "strconv" 19 "strings" 20 "time" 21 22 "github.com/quay/zlog" 23 24 "github.com/quay/claircore" 25 "github.com/quay/claircore/indexer" 26 "github.com/quay/claircore/java/jar" 27 ) 28 29 var ( 30 _ indexer.VersionedScanner = (*Scanner)(nil) 31 _ indexer.PackageScanner = (*Scanner)(nil) 32 _ indexer.RPCScanner = (*Scanner)(nil) 33 _ indexer.DefaultRepoScanner = (*Scanner)(nil) 34 35 Repository = claircore.Repository{ 36 Name: "maven", 37 URI: "https://repo1.maven.apache.org/maven2", 38 } 39 ) 40 41 // DefaultSearchAPI is a maven-like REST API that may be used to do 42 // reverse lookups based on an archive's sha1 sum. 43 // 44 //doc:url indexer 45 const DefaultSearchAPI = `https://search.maven.org/solrsearch/select` 46 const DefaultRequestTimeout = 2 * time.Second 47 48 // ScannerConfig is the struct used to configure a Scanner. 49 type ScannerConfig struct { 50 // DisableAPI disables the use of the API. 51 DisableAPI bool `yaml:"disable_api" json:"disable_api"` 52 // API is a URL endpoint to a maven-like REST API. 53 // The default is DefaultSearchAPI. 54 API string `yaml:"api" json:"api"` 55 APIRequestTimeout time.Duration `yaml:"api_request_timeout" json:"api_request_timeout"` 56 } 57 58 // Scanner implements the scanner.PackageScanner interface. 59 // 60 // It looks for files that seem like jar, war or ear, and looks at the 61 // metadata recorded there. 62 // 63 // The zero value is ready to use. 64 type Scanner struct { 65 client *http.Client 66 root *url.URL 67 rootRequestTimeout time.Duration 68 } 69 70 // Name implements scanner.VersionedScanner. 71 func (*Scanner) Name() string { return "java" } 72 73 // Version implements scanner.VersionedScanner. 74 func (*Scanner) Version() string { return "6" } 75 76 // Kind implements scanner.VersionedScanner. 77 func (*Scanner) Kind() string { return "package" } 78 79 // Configure implements indexer.RPCScanner. 80 func (s *Scanner) Configure(ctx context.Context, f indexer.ConfigDeserializer, c *http.Client) error { 81 ctx = zlog.ContextWithValues(ctx, 82 "component", "java/Scanner.Configure", 83 "version", s.Version()) 84 var cfg ScannerConfig 85 s.client = c 86 if err := f(&cfg); err != nil { 87 return err 88 } 89 90 if cfg.DisableAPI { 91 zlog.Debug(ctx).Msg("search API disabled") 92 } else { 93 api := DefaultSearchAPI 94 if cfg.API != "" { 95 api = cfg.API 96 } 97 requestTimeout := DefaultRequestTimeout 98 if cfg.APIRequestTimeout != 0 { 99 requestTimeout = cfg.APIRequestTimeout 100 } 101 s.rootRequestTimeout = requestTimeout 102 zlog.Debug(ctx). 103 Str("api", api). 104 Float64("requestTimeout", requestTimeout.Seconds()). 105 Msg("configured search API URL") 106 u, err := url.Parse(api) 107 if err != nil { 108 return err 109 } 110 s.root = u 111 } 112 113 return nil 114 } 115 116 // Scan attempts to find jar, war or ear files and record the package 117 // information there. 118 // 119 // A return of (nil, nil) is expected if there's nothing found. 120 func (s *Scanner) Scan(ctx context.Context, layer *claircore.Layer) ([]*claircore.Package, error) { 121 defer trace.StartRegion(ctx, "Scanner.Scan").End() 122 trace.Log(ctx, "layer", layer.Hash.String()) 123 ctx = zlog.ContextWithValues(ctx, 124 "component", "java/Scanner.Scan", 125 "version", s.Version(), 126 "layer", layer.Hash.String()) 127 zlog.Debug(ctx).Msg("start") 128 defer zlog.Debug(ctx).Msg("done") 129 if err := ctx.Err(); err != nil { 130 return nil, err 131 } 132 sys, err := layer.FS() 133 if err != nil { 134 return nil, fmt.Errorf("java: unable to open layer: %w", err) 135 } 136 137 ars, err := archives(ctx, sys) 138 if err != nil { 139 return nil, err 140 } 141 // All used in the loop below. 142 var ret []*claircore.Package 143 buf := getBuf() 144 sh := sha1.New() 145 ck := make([]byte, sha1.Size) 146 doSearch := s.root != nil 147 defer putBuf(buf) 148 for _, n := range ars { 149 ctx := zlog.ContextWithValues(ctx, "file", n) 150 sh.Reset() 151 buf.Reset() 152 // Calculate the SHA1 as it's buffered, since it may be needed for 153 // searching later. 154 f, err := sys.Open(n) 155 if err != nil { 156 return nil, err 157 } 158 fStat, err := f.Stat() 159 if err == nil { 160 buf.Grow(int(fStat.Size())) 161 } 162 sz, err := buf.ReadFrom(io.TeeReader(f, sh)) 163 f.Close() 164 if err != nil { 165 return nil, err 166 } 167 zb := buf.Bytes() 168 if !bytes.Equal(zb[:4], jar.Header) { 169 // Has a reasonable size and name, but isn't really a zip. 170 zlog.Debug(ctx).Msg("not actually a jar: bad header") 171 continue 172 } 173 z, err := zip.NewReader(bytes.NewReader(zb), sz) 174 switch { 175 case errors.Is(err, nil): 176 case errors.Is(err, zip.ErrFormat): 177 zlog.Info(ctx). 178 Err(err). 179 Msg("not actually a jar: invalid zip") 180 continue 181 default: 182 return nil, err 183 } 184 185 infos, err := jar.Parse(ctx, n, z) 186 switch { 187 case err == nil: 188 case errors.Is(err, jar.ErrUnidentified) || errors.Is(err, jar.ErrNotAJar): 189 // If there's an error that's one of the "known" reasons (e.g. not a 190 // read error or a malformed file), just log it and continue on. 191 zlog.Info(ctx). 192 AnErr("reason", err). 193 Msg("skipping jar") 194 continue 195 default: 196 return nil, err 197 } 198 sh.Sum(ck[:0]) 199 ps := make([]*claircore.Package, len(infos)) 200 for j := range infos { 201 i := &infos[j] 202 // If we discovered a pom file, don't bother talking to the network. 203 // If not, talk to the network if configured to do so. 204 if !strings.HasSuffix(i.Source, "pom.properties") && doSearch { 205 switch err := s.search(ctx, i, ck); { 206 case errors.Is(err, nil): // OK 207 case errors.Is(err, errRPC): 208 // BUG(hank) There's no way for a scanner that makes RPC calls 209 // to signal "the call failed, these are best-effort results, 210 // and please retry." 211 default: 212 return nil, err 213 } 214 } 215 216 var pkg claircore.Package 217 pkg.Name = i.Name 218 pkg.Version = i.Version 219 pkg.Kind = claircore.BINARY 220 pkg.Filepath = n 221 b := ck 222 if len(i.SHA) != 0 { 223 b = i.SHA 224 } 225 pkg.RepositoryHint = fmt.Sprintf(`sha1:%40x`, b) 226 // BUG(hank) There's probably some bugs lurking in the jar.Info → 227 // claircore.Package mapping code around embedded jars. There's a 228 // testcase to be written, there. 229 230 // Only examine the last element of the source list: 231 js := strings.Split(i.Source, ":") 232 switch l := js[len(js)-1]; { 233 case strings.HasSuffix(l, "pom.properties"): 234 fallthrough 235 case s.root != nil && i.Source == s.root.String(): 236 // Populate as a maven artifact. 237 pkg.PackageDB = `maven:` + n 238 case l == "META-INF/MANIFEST.MF": 239 // information pulled from a manifest file 240 pkg.PackageDB = `jar:` + n 241 case l == ".": 242 // Name guess. 243 pkg.PackageDB = `file:` + n 244 default: 245 return nil, fmt.Errorf("java: martian Info: %+v", i) 246 } 247 ps[j] = &pkg 248 } 249 ret = append(ret, ps...) 250 } 251 return ret, nil 252 } 253 254 // DefaultRepository implements [indexer.DefaultRepoScanner]. 255 func (Scanner) DefaultRepository(ctx context.Context) *claircore.Repository { 256 return &Repository 257 } 258 259 // Search attempts to search with the configured client and API endpoint. 260 // 261 // This function modifies the passed Info in-place if successful. The passed 262 // byte slice should be a SHA1 sum of the jar. It is used if the "SHA" member of 263 // the Info is not populated. 264 // 265 // ErrRPC is reported if anything went wrong making the request or reading the 266 // response. 267 func (s *Scanner) search(ctx context.Context, i *jar.Info, ck []byte) error { 268 if i.SHA != nil { 269 ck = i.SHA 270 } 271 success := false 272 defer func() { 273 searchCounter.WithLabelValues(strconv.FormatBool(success)).Inc() 274 }() 275 tctx, done := context.WithTimeout(ctx, s.rootRequestTimeout) 276 defer done() 277 req, err := http.NewRequestWithContext(tctx, http.MethodGet, s.root.String(), nil) 278 if err != nil { 279 zlog.Warn(ctx). 280 Err(err). 281 Msg("unable to construct request") 282 return errRPC 283 } 284 v := req.URL.Query() 285 // 40 == 2 * sha1.Size. I don't there's a good way to keep it as 286 // a constant. 287 v.Set("q", fmt.Sprintf(`1:"%40x"`, ck)) 288 v.Set("wt", "json") 289 req.URL.RawQuery = v.Encode() 290 res, err := s.client.Do(req) 291 if err != nil { 292 zlog.Warn(ctx). 293 Err(err). 294 Msg("error making request") 295 return errRPC 296 } 297 if res.StatusCode != http.StatusOK { 298 res.Body.Close() 299 zlog.Warn(ctx). 300 Str("status", res.Status). 301 Msg("unexpected response status") 302 return errRPC 303 } 304 var sr searchResponse 305 err = json.NewDecoder(res.Body).Decode(&sr) 306 res.Body.Close() 307 if err != nil { 308 zlog.Warn(ctx). 309 Err(err). 310 Msg("error decoding json") 311 return errRPC 312 } 313 success = true 314 if len(sr.Response.Doc) == 0 { 315 zlog.Debug(ctx).Msg("no matching artifacts found") 316 return nil 317 } 318 // Sort and then take the first one, because apparently the same 319 // artifact is uploaded under different names sometimes? 320 sort.SliceStable(sr.Response.Doc, func(i, j int) bool { 321 return sr.Response.Doc[i].ID < sr.Response.Doc[j].ID 322 }) 323 i.Source = s.root.String() 324 d := &sr.Response.Doc[0] 325 i.Version = d.Version 326 i.Name = d.Group + ":" + d.Artifact 327 return nil 328 } 329 330 var errRPC = errors.New("search rpc failed") 331 332 // SearchResponse is the response from maven. 333 // 334 // Created by eyeballing the response from 335 // https://search.maven.org/solrsearch/select?q=1:%2235379fb6526fd019f331542b4e9ae2e566c57933%22&wt=json 336 type searchResponse struct { 337 Response struct { 338 Doc []struct { 339 ID string `json:"id"` 340 Group string `json:"g"` 341 Artifact string `json:"a"` 342 Version string `json:"v"` 343 Classifier string `json:"p"` 344 } `json:"docs"` 345 } `json:"response"` 346 }