github.com/google/osv-scalibr@v0.4.1/clients/datasource/maven_registry.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package datasource
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"encoding/xml"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"net/http"
    25  	"net/url"
    26  	"os"
    27  	"path/filepath"
    28  	"slices"
    29  	"strings"
    30  	"sync"
    31  	"time"
    32  
    33  	"deps.dev/util/maven"
    34  	"deps.dev/util/semver"
    35  	"github.com/google/osv-scalibr/log"
    36  	"golang.org/x/net/html/charset"
    37  	"golang.org/x/oauth2/google"
    38  )
    39  
    40  // mavenCentral holds the URL of Maven Central Repository.
    41  const mavenCentral = "https://repo.maven.apache.org/maven2"
    42  
    43  // artifactRegistryScheme defines the scheme for Google Artifact Registry.
    44  const artifactRegistryScheme = "artifactregistry"
    45  
    46  var errAPIFailed = errors.New("API query failed")
    47  
// MavenRegistryAPIClient defines a client to fetch metadata from a Maven registry.
//
// Lookups try the additional registries in the order they were added and
// then the default registry. HTTP responses are memoized per request URL, and
// successful responses are optionally mirrored to a local directory.
type MavenRegistryAPIClient struct {
	defaultRegistry MavenRegistry                  // The default registry that we are making requests
	registries      []MavenRegistry                // Additional registries specified to fetch projects
	registryAuths   map[string]*HTTPAuthentication // Authentication for the registries keyed by registry ID. From settings.xml
	localRegistry   string                         // The local directory that holds Maven manifests; empty disables the local copy

	googleClient      *http.Client // A client for authenticating with Google services, used for Artifact Registry.
	disableGoogleAuth bool         // If true, do not try to create google.DefaultClient for Artifact Registry.

	// Cache fields
	// NOTE(review): mu is not locked by any method visible in this file —
	// presumably it guards cache persistence implemented elsewhere; confirm.
	mu             *sync.Mutex
	cacheTimestamp *time.Time // If set, this means we loaded from a cache
	// responses memoizes registry responses keyed by the full request URL.
	responses *RequestCache[string, response]
}
    63  
// response is the cached outcome of a single registry request: the HTTP
// status code together with the raw, undecoded body.
type response struct {
	StatusCode int
	Body       []byte
}
    68  
// MavenRegistry defines a Maven registry.
type MavenRegistry struct {
	// URL is the registry base URL as given by the caller.
	URL string
	// Parsed is the result of url.Parse(URL). The client fills it in when the
	// registry is registered, so callers do not need to set it.
	Parsed *url.URL

	// Information from pom.xml
	// ID identifies the registry; it is also the key used to look up its
	// authentication settings.
	ID string
	// ReleasesEnabled reports whether non-snapshot versions may be fetched
	// from this registry.
	ReleasesEnabled bool
	// SnapshotsEnabled reports whether -SNAPSHOT versions may be fetched from
	// this registry.
	SnapshotsEnabled bool
}
    79  
    80  // NewMavenRegistryAPIClient returns a new MavenRegistryAPIClient.
    81  func NewMavenRegistryAPIClient(ctx context.Context, registry MavenRegistry, localRegistry string, disableGoogleClient bool) (*MavenRegistryAPIClient, error) {
    82  	if registry.URL == "" {
    83  		registry.URL = mavenCentral
    84  		registry.ID = "central"
    85  	}
    86  	if registry.ID == "" {
    87  		// Gives the default registry an ID so it is not overwritten by registry without an ID in pom.xml.
    88  		registry.ID = "default"
    89  	}
    90  	u, err := url.Parse(registry.URL)
    91  	if err != nil {
    92  		return nil, fmt.Errorf("invalid Maven registry %s: %w", registry.URL, err)
    93  	}
    94  	registry.Parsed = u
    95  
    96  	if localRegistry != "" {
    97  		localRegistry = filepath.Join(localRegistry, "maven")
    98  	}
    99  
   100  	// TODO: allow for manual specification of settings files
   101  	globalSettings := ParseMavenSettings(globalMavenSettingsFile())
   102  	userSettings := ParseMavenSettings(userMavenSettingsFile())
   103  
   104  	client := &MavenRegistryAPIClient{
   105  		// We assume only downloading releases is allowed on the default registry.
   106  		defaultRegistry:   registry,
   107  		localRegistry:     localRegistry,
   108  		mu:                &sync.Mutex{},
   109  		responses:         NewRequestCache[string, response](),
   110  		registryAuths:     MakeMavenAuth(globalSettings, userSettings),
   111  		disableGoogleAuth: disableGoogleClient,
   112  	}
   113  	if registry.Parsed.Scheme == artifactRegistryScheme {
   114  		client.createGoogleClient(ctx)
   115  	}
   116  	return client, nil
   117  }
   118  
   119  // NewDefaultMavenRegistryAPIClient creates a new MavenRegistryAPIClient with default settings,
   120  // using the provided registry URL.
   121  func NewDefaultMavenRegistryAPIClient(ctx context.Context, registry string) (*MavenRegistryAPIClient, error) {
   122  	return NewMavenRegistryAPIClient(ctx, MavenRegistry{URL: registry, ReleasesEnabled: true}, "", false)
   123  }
   124  
   125  // SetLocalRegistry sets the local directory that stores the downloaded Maven manifests.
   126  func (m *MavenRegistryAPIClient) SetLocalRegistry(localRegistry string) {
   127  	if localRegistry != "" {
   128  		localRegistry = filepath.Join(localRegistry, "maven")
   129  	}
   130  	m.localRegistry = localRegistry
   131  }
   132  
   133  // WithoutRegistries makes MavenRegistryAPIClient including its cache but not registries.
   134  func (m *MavenRegistryAPIClient) WithoutRegistries() *MavenRegistryAPIClient {
   135  	return &MavenRegistryAPIClient{
   136  		defaultRegistry:   m.defaultRegistry,
   137  		localRegistry:     m.localRegistry,
   138  		mu:                m.mu,
   139  		cacheTimestamp:    m.cacheTimestamp,
   140  		responses:         m.responses,
   141  		registryAuths:     m.registryAuths,
   142  		googleClient:      m.googleClient,
   143  		disableGoogleAuth: m.disableGoogleAuth,
   144  	}
   145  }
   146  
   147  // AddRegistry adds the given registry to the list of registries if it has not been added.
   148  func (m *MavenRegistryAPIClient) AddRegistry(ctx context.Context, registry MavenRegistry) error {
   149  	if registry.ID == m.defaultRegistry.ID {
   150  		return m.updateDefaultRegistry(ctx, registry)
   151  	}
   152  
   153  	for _, reg := range m.registries {
   154  		if reg.ID == registry.ID {
   155  			return nil
   156  		}
   157  	}
   158  
   159  	u, err := url.Parse(registry.URL)
   160  	if err != nil {
   161  		return err
   162  	}
   163  
   164  	registry.Parsed = u
   165  	m.registries = append(m.registries, registry)
   166  	if registry.Parsed.Scheme == artifactRegistryScheme {
   167  		m.createGoogleClient(ctx)
   168  	}
   169  
   170  	return nil
   171  }
   172  
   173  func (m *MavenRegistryAPIClient) updateDefaultRegistry(ctx context.Context, registry MavenRegistry) error {
   174  	u, err := url.Parse(registry.URL)
   175  	if err != nil {
   176  		return err
   177  	}
   178  	registry.Parsed = u
   179  	m.defaultRegistry = registry
   180  	if registry.Parsed.Scheme == artifactRegistryScheme {
   181  		m.createGoogleClient(ctx)
   182  	}
   183  	return nil
   184  }
   185  
   186  // createGoogleClient creates a client for authenticating with Google services.
   187  func (m *MavenRegistryAPIClient) createGoogleClient(ctx context.Context) {
   188  	if m.googleClient != nil || m.disableGoogleAuth {
   189  		return
   190  	}
   191  	// This is the scope that artifact-registry-go-tools uses.
   192  	// https://github.com/GoogleCloudPlatform/artifact-registry-go-tools/blob/main/pkg/auth/auth.go
   193  	client, err := google.DefaultClient(ctx, "https://www.googleapis.com/auth/cloud-platform")
   194  	if err != nil {
   195  		// We don't return an error here so that we can fall back to a regular http client.
   196  		log.Warnf("failed to create Google default client, Artifact Registry access will be unavailable: %v", err)
   197  		return
   198  	}
   199  	m.googleClient = client
   200  }
   201  
// DisableGoogleAuth prevents the creation of a Google client for authentication purpose.
// It only sets the flag: a Google client that was already created is kept.
func (m *MavenRegistryAPIClient) DisableGoogleAuth() {
	m.disableGoogleAuth = true
}
   206  
// GetRegistries returns the registries added to this client.
// The default registry is not included. The returned slice is the client's
// own slice, not a copy.
func (m *MavenRegistryAPIClient) GetRegistries() (registries []MavenRegistry) {
	return m.registries
}
   211  
   212  // GetProject fetches a pom.xml specified by groupID, artifactID and version and parses it to maven.Project.
   213  // Each registry in the list is tried until we find the project.
   214  // For a snapshot version, version level metadata is used to find the extact version string.
   215  // More about Maven Repository Metadata Model: https://maven.apache.org/ref/3.9.9/maven-repository-metadata/
   216  // More about Maven Metadata: https://maven.apache.org/repositories/metadata.html
   217  func (m *MavenRegistryAPIClient) GetProject(ctx context.Context, groupID, artifactID, version string) (maven.Project, error) {
   218  	if !strings.HasSuffix(version, "-SNAPSHOT") {
   219  		for _, registry := range append(m.registries, m.defaultRegistry) {
   220  			if !registry.ReleasesEnabled {
   221  				continue
   222  			}
   223  			project, err := m.getProject(ctx, registry, groupID, artifactID, version, "")
   224  			if err == nil {
   225  				return project, nil
   226  			}
   227  		}
   228  
   229  		return maven.Project{}, fmt.Errorf("failed to fetch Maven project %s:%s@%s", groupID, artifactID, version)
   230  	}
   231  
   232  	for _, registry := range append(m.registries, m.defaultRegistry) {
   233  		// Fetch version metadata for snapshot versions from the registries enabling that.
   234  		if !registry.SnapshotsEnabled {
   235  			continue
   236  		}
   237  		metadata, err := m.getVersionMetadata(ctx, registry, groupID, artifactID, version)
   238  		if err != nil {
   239  			continue
   240  		}
   241  
   242  		snapshot := ""
   243  		for _, sv := range metadata.Versioning.SnapshotVersions {
   244  			if sv.Extension == "pom" {
   245  				// We only look for pom.xml for project metadata.
   246  				snapshot = string(sv.Value)
   247  				break
   248  			}
   249  		}
   250  
   251  		project, err := m.getProject(ctx, registry, groupID, artifactID, version, snapshot)
   252  		if err == nil {
   253  			return project, nil
   254  		}
   255  	}
   256  
   257  	return maven.Project{}, fmt.Errorf("failed to fetch Maven project %s:%s@%s", groupID, artifactID, version)
   258  }
   259  
   260  // GetVersions returns the list of available versions of a Maven package specified by groupID and artifactID.
   261  // Versions found in all registries are unioned, then sorted by semver.
   262  func (m *MavenRegistryAPIClient) GetVersions(ctx context.Context, groupID, artifactID string) ([]maven.String, error) {
   263  	var versions []maven.String
   264  	for _, registry := range append(m.registries, m.defaultRegistry) {
   265  		metadata, err := m.getArtifactMetadata(ctx, registry, groupID, artifactID)
   266  		if err != nil {
   267  			continue
   268  		}
   269  		versions = append(versions, metadata.Versioning.Versions...)
   270  	}
   271  	slices.SortFunc(versions, func(a, b maven.String) int { return semver.Maven.Compare(string(a), string(b)) })
   272  
   273  	return slices.Compact(versions), nil
   274  }
   275  
   276  // getProject fetches a pom.xml specified by groupID, artifactID and version and parses it to maven.Project.
   277  // For snapshot versions, the exact version value is specified by snapshot.
   278  func (m *MavenRegistryAPIClient) getProject(ctx context.Context, registry MavenRegistry, groupID, artifactID, version, snapshot string) (maven.Project, error) {
   279  	if snapshot == "" {
   280  		snapshot = version
   281  	}
   282  
   283  	var project maven.Project
   284  	if err := m.get(ctx, m.registryAuths[registry.ID], registry, []string{strings.ReplaceAll(groupID, ".", "/"), artifactID, version, fmt.Sprintf("%s-%s.pom", artifactID, snapshot)}, &project); err != nil {
   285  		return maven.Project{}, err
   286  	}
   287  
   288  	return project, nil
   289  }
   290  
   291  // getVersionMetadata fetches a version level maven-metadata.xml and parses it to maven.Metadata.
   292  func (m *MavenRegistryAPIClient) getVersionMetadata(ctx context.Context, registry MavenRegistry, groupID, artifactID, version string) (maven.Metadata, error) {
   293  	var metadata maven.Metadata
   294  	if err := m.get(ctx, m.registryAuths[registry.ID], registry, []string{strings.ReplaceAll(groupID, ".", "/"), artifactID, version, "maven-metadata.xml"}, &metadata); err != nil {
   295  		return maven.Metadata{}, err
   296  	}
   297  
   298  	return metadata, nil
   299  }
   300  
   301  // GetArtifactMetadata fetches an artifact level maven-metadata.xml and parses it to maven.Metadata.
   302  func (m *MavenRegistryAPIClient) getArtifactMetadata(ctx context.Context, registry MavenRegistry, groupID, artifactID string) (maven.Metadata, error) {
   303  	var metadata maven.Metadata
   304  	if err := m.get(ctx, m.registryAuths[registry.ID], registry, []string{strings.ReplaceAll(groupID, ".", "/"), artifactID, "maven-metadata.xml"}, &metadata); err != nil {
   305  		return maven.Metadata{}, err
   306  	}
   307  
   308  	return metadata, nil
   309  }
   310  
   311  func (m *MavenRegistryAPIClient) get(ctx context.Context, auth *HTTPAuthentication, registry MavenRegistry, paths []string, dst any) error {
   312  	filePath := ""
   313  	if m.localRegistry != "" {
   314  		filePath = filepath.Join(append([]string{m.localRegistry}, paths...)...)
   315  		file, err := os.Open(filePath)
   316  		if err == nil {
   317  			defer file.Close()
   318  			// We can still fetch the file from upstream if error is not nil.
   319  			return NewMavenDecoder(file).Decode(dst)
   320  		}
   321  		if !os.IsNotExist(err) {
   322  			log.Warnf("Error reading from local cache %s: %v", filePath, err)
   323  		}
   324  	}
   325  
   326  	httpClient := http.DefaultClient
   327  	requestURL := *registry.Parsed
   328  	isArtifactRegistry := requestURL.Scheme == artifactRegistryScheme
   329  	if isArtifactRegistry {
   330  		requestURL.Scheme = "https"
   331  		// For Artifact Registry, use google.DefaultClient for ADC if available.
   332  		if m.googleClient != nil {
   333  			httpClient = m.googleClient
   334  		}
   335  	}
   336  
   337  	u := requestURL.JoinPath(paths...).String()
   338  	resp, err := m.responses.Get(u, func() (response, error) {
   339  		log.Infof("Fetching response from: %s", u)
   340  		resp, err := auth.Get(ctx, httpClient, u)
   341  		if err != nil {
   342  			return response{}, fmt.Errorf("%w: Maven registry query failed: %w", errAPIFailed, err)
   343  		}
   344  		defer resp.Body.Close()
   345  
   346  		if !slices.Contains([]int{http.StatusOK, http.StatusNotFound, http.StatusUnauthorized, http.StatusForbidden}, resp.StatusCode) {
   347  			// Only cache responses with Status OK, NotFound, Unauthorized, or Forbidden
   348  			return response{}, fmt.Errorf("%w: Maven registry query status: %d", errAPIFailed, resp.StatusCode)
   349  		}
   350  
   351  		b, err := io.ReadAll(resp.Body)
   352  		if err != nil {
   353  			return response{}, fmt.Errorf("failed to read body: %w", err)
   354  		}
   355  
   356  		if filePath != "" && resp.StatusCode == http.StatusOK {
   357  			if err := writeFile(filePath, b); err != nil {
   358  				log.Warnf("failed to write response to %s: %v", u, err)
   359  			}
   360  		}
   361  
   362  		return response{StatusCode: resp.StatusCode, Body: b}, nil
   363  	})
   364  	if err != nil {
   365  		log.Warnf("failed to get response from %s: %v", u, err)
   366  		return err
   367  	}
   368  
   369  	if resp.StatusCode == http.StatusForbidden && isArtifactRegistry {
   370  		return fmt.Errorf("%w: Maven registry query status: %d (Forbidden). Please check your Application Default Credentials (ADC) have permission to read from %s", errAPIFailed, resp.StatusCode, registry.URL)
   371  	}
   372  
   373  	if resp.StatusCode != http.StatusOK {
   374  		return fmt.Errorf("%w: Maven registry query status: %d", errAPIFailed, resp.StatusCode)
   375  	}
   376  
   377  	return NewMavenDecoder(bytes.NewReader(resp.Body)).Decode(dst)
   378  }
   379  
   380  // writeFile writes the bytes to the file specified by the given path.
   381  func writeFile(path string, data []byte) error {
   382  	dir := filepath.Dir(path)
   383  	// Create the directory if it doesn't exist.
   384  	if err := os.MkdirAll(dir, 0755); err != nil {
   385  		return fmt.Errorf("failed to create directory %s: %w", dir, err)
   386  	}
   387  
   388  	outFile, err := os.Create(path)
   389  	if err != nil {
   390  		return fmt.Errorf("failed to create file %s: %w", path, err)
   391  	}
   392  	defer outFile.Close()
   393  
   394  	if _, err := outFile.Write(data); err != nil {
   395  		return fmt.Errorf("failed to write file %s: %w", path, err)
   396  	}
   397  
   398  	return nil
   399  }
   400  
   401  // NewMavenDecoder returns an xml decoder with CharsetReader and Entity set.
   402  func NewMavenDecoder(reader io.Reader) *xml.Decoder {
   403  	decoder := xml.NewDecoder(reader)
   404  	// Set charset reader for conversion from non-UTF-8 charset into UTF-8.
   405  	decoder.CharsetReader = charset.NewReaderLabel
   406  	// Set HTML entity map for translation between non-standard entity names
   407  	// and string replacements.
   408  	decoder.Entity = xml.HTMLEntity
   409  
   410  	return decoder
   411  }