github.com/google/osv-scalibr@v0.4.1/clients/datasource/pypi_registry.go (about) 1 // Copyright 2025 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package datasource 16 17 import ( 18 "context" 19 "encoding/json" 20 "fmt" 21 "io" 22 "net/http" 23 "net/url" 24 "os" 25 "path" 26 "path/filepath" 27 "slices" 28 "strings" 29 "sync" 30 "time" 31 32 "github.com/google/osv-scalibr/clients/internal/pypi" 33 "github.com/google/osv-scalibr/log" 34 ) 35 36 // pyPIAPI holds the base of the URL of PyPI Index API. 37 const pyPIAPI = "https://pypi.org/simple" 38 39 // PyPIRegistryAPIClient defines a client to fetch metadata from a PyPI registry. 40 // TODO(#541): support multiple registries and authentication 41 type PyPIRegistryAPIClient struct { 42 registry string 43 localRegistry string 44 45 // Cache fields 46 mu *sync.Mutex 47 cacheTimestamp *time.Time // If set, this means we loaded from a cache 48 responses *RequestCache[string, response] 49 } 50 51 // NewPyPIRegistryAPIClient returns a new PyPIRegistryAPIClient. 52 func NewPyPIRegistryAPIClient(registry string, localRegistry string) *PyPIRegistryAPIClient { 53 if registry == "" { 54 registry = pyPIAPI 55 } 56 if localRegistry != "" { 57 localRegistry = filepath.Join(localRegistry, "pypi") 58 } 59 return &PyPIRegistryAPIClient{ 60 registry: registry, 61 localRegistry: localRegistry, 62 mu: &sync.Mutex{}, 63 responses: NewRequestCache[string, response](), 64 } 65 } 66 67 // SetLocalRegistry sets the local directory that stores the downloaded PyPI manifests. 68 func (p *PyPIRegistryAPIClient) SetLocalRegistry(localRegistry string) { 69 if localRegistry != "" { 70 localRegistry = filepath.Join(localRegistry, "pypi") 71 } 72 p.localRegistry = localRegistry 73 } 74 75 // GetIndex queries the simple API index for a given project. 76 func (p *PyPIRegistryAPIClient) GetIndex(ctx context.Context, project string) (pypi.IndexResponse, error) { 77 reqPath, err := url.JoinPath(p.registry, project) 78 if err != nil { 79 return pypi.IndexResponse{}, err 80 } 81 82 // The Index API requires an ending slash. 83 if !strings.HasSuffix(reqPath, "/") { 84 reqPath += "/" 85 } 86 87 var indexResp pypi.IndexResponse 88 resp, err := p.get(ctx, reqPath, true) 89 if err != nil { 90 return pypi.IndexResponse{}, err 91 } 92 err = json.Unmarshal(resp, &indexResp) 93 return indexResp, err 94 } 95 96 // GetFile retrieves the content of a file from the registry at the given URL. 97 func (p *PyPIRegistryAPIClient) GetFile(ctx context.Context, url string) ([]byte, error) { 98 return p.get(ctx, url, false) 99 } 100 101 // urlToPath converts a URL to a file path. 102 func urlToPath(rawURL string) string { 103 parsedURL, err := url.Parse(rawURL) 104 if err != nil { 105 log.Warnf("Error parsing URL %s: %s", rawURL, err) 106 return "" 107 } 108 return path.Join(parsedURL.Hostname(), parsedURL.Path) 109 } 110 111 func (p *PyPIRegistryAPIClient) get(ctx context.Context, url string, queryIndex bool) ([]byte, error) { 112 file := "" 113 urlPath := urlToPath(url) 114 if urlPath != "" && p.localRegistry != "" { 115 file = filepath.Join(p.localRegistry, urlPath) 116 if content, err := os.ReadFile(file); err == nil { 117 // We can still fetch the file from upstream if error is not nil. 118 return content, nil 119 } 120 } 121 122 resp, err := p.responses.Get(url, func() (response, error) { 123 req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 124 if err != nil { 125 return response{}, err 126 } 127 128 if queryIndex { 129 req.Header.Set("Accept", "application/vnd.pypi.simple.v1+json") 130 } 131 resp, err := http.DefaultClient.Do(req) 132 if err != nil { 133 return response{}, fmt.Errorf("%w: PyPI registry query failed: %w", errAPIFailed, err) 134 } 135 defer resp.Body.Close() 136 137 if !slices.Contains([]int{http.StatusOK, http.StatusNotFound, http.StatusUnauthorized}, resp.StatusCode) { 138 // Only cache responses with Status OK, NotFound, or Unauthorized 139 return response{}, fmt.Errorf("%w: PyPI registry query status: %d", errAPIFailed, resp.StatusCode) 140 } 141 142 b, err := io.ReadAll(resp.Body) 143 if err != nil { 144 return response{}, fmt.Errorf("failed to read body: %w", err) 145 } 146 147 if file != "" { 148 if err := writeFile(file, b); err != nil { 149 log.Warnf("failed to write response of %s: %v", url, err) 150 } 151 } 152 153 return response{StatusCode: resp.StatusCode, Body: b}, nil 154 }) 155 if err != nil { 156 return nil, err 157 } 158 159 if resp.StatusCode != http.StatusOK { 160 return nil, fmt.Errorf("%w: PyPI registry query status: %d", errAPIFailed, resp.StatusCode) 161 } 162 163 return resp.Body, nil 164 }