github.com/zppinho/prow@v0.0.0-20240510014325-1738badeb017/pkg/git/v2/client_factory.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package git
    18  
    19  import (
    20  	"fmt"
    21  	"os"
    22  	"os/exec"
    23  	"path"
    24  	"runtime"
    25  	"sync"
    26  	"time"
    27  
    28  	"github.com/prometheus/client_golang/prometheus"
    29  	"github.com/sirupsen/logrus"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	utilpointer "k8s.io/utils/pointer"
    32  )
    33  
// gitMetrics groups the Prometheus histograms used to time git operations.
// All of them are described (see the Help strings) as measuring seconds. The
// three *Vec histograms are partitioned by the "org" and "repo" labels, while
// sparseCheckoutDuration is a single, unlabelled histogram. The primary-clone
// and fetch-by-SHA histograms use buckets reaching 1200 seconds; the secondary
// clone and sparse checkout histograms stop at 90 seconds.
var gitMetrics = struct {
	ensureFreshPrimaryDuration *prometheus.HistogramVec
	fetchByShaDuration         *prometheus.HistogramVec
	secondaryCloneDuration     *prometheus.HistogramVec
	sparseCheckoutDuration     prometheus.Histogram
}{
	ensureFreshPrimaryDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "git_ensure_fresh_primary_duration",
		Help:    "Histogram of seconds spent ensuring that the primary is fresh, by org and repo.",
		Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90, 120, 180, 300, 450, 600, 750, 900, 1050, 1200},
	}, []string{
		"org", "repo",
	}),
	fetchByShaDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "git_fetch_by_sha_duration",
		Help:    "Histogram of seconds spent fetching commit SHAs, by org and repo.",
		Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90, 120, 180, 300, 450, 600, 750, 900, 1050, 1200},
	}, []string{
		"org", "repo",
	}),
	secondaryCloneDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "git_secondary_clone_duration",
		Help:    "Histogram of seconds spent creating the secondary clone, by org and repo.",
		Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90},
	}, []string{
		"org", "repo",
	}),
	sparseCheckoutDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "sparse_checkout_duration",
		Help:    "Histogram of seconds spent performing sparse checkout for a repository",
		Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90},
	}),
}
    67  
    68  func init() {
    69  	prometheus.MustRegister(gitMetrics.ensureFreshPrimaryDuration)
    70  	prometheus.MustRegister(gitMetrics.fetchByShaDuration)
    71  	prometheus.MustRegister(gitMetrics.secondaryCloneDuration)
    72  	prometheus.MustRegister(gitMetrics.sparseCheckoutDuration)
    73  }
    74  
// ClientFactory knows how to create clients for repos.
type ClientFactory interface {
	// ClientFromDir creates a client that operates on a repo that has already
	// been cloned to the given directory.
	ClientFromDir(org, repo, dir string) (RepoClient, error)
	// ClientFor creates a client that operates on a new clone of the repo.
	ClientFor(org, repo string) (RepoClient, error)
	// ClientForWithRepoOpts is like ClientFor, but allows you to customize the
	// setup of the cloned repo (such as sparse checkouts instead of using the
	// default full clone).
	ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error)

	// Clean removes the caches used to generate clients.
	Clean() error
}
    90  
// RepoClient exposes interactions with a git repo. It is the union of the
// Publisher and Interactor interfaces.
type RepoClient interface {
	Publisher
	Interactor
}
    96  
// repoClient is the concrete client assembled by clientFactory. It embeds a
// publisher and an interactor, and is handed out as a RepoClient.
type repoClient struct {
	publisher
	interactor
}
   101  
// ClientFactoryOpts holds the settings that NewClientFactory uses to build a
// ClientFactory. For pointer-typed fields, nil means "unset".
type ClientFactoryOpts struct {
	// Host, defaults to "github.com" if unset
	Host string
	// Whether to use plain HTTP instead of HTTPS. By default, HTTPS is used.
	// NewClientFactory only consults this when UseSSH is not true and
	// CookieFilePath is empty, since those select a different remote resolver
	// first.
	//
	// TODO (listx): Combine HTTPS, HTTP, and SSH schemes into a single enum.
	UseInsecureHTTP *bool
	// UseSSH, defaults to false
	UseSSH *bool
	// The directory in which the cache should be
	// created. Defaults to "/var/tmp" on
	// Linux and the default temporary directory otherwise.
	CacheDirBase *string
	// If unset, publishing action will error
	Username LoginGetter
	// If unset, publishing action will error
	Token TokenGetter
	// The git user to use.
	GitUser GitUserGetter
	// The censor to use. Not needed for anonymous
	// actions.
	Censor Censor
	// Path to the httpCookieFile that will be used to authenticate client
	CookieFilePath string
	// If set to true, CacheDirBase itself is used as the cache directory so
	// that it persists between runs. Otherwise a new temp dir created under
	// CacheDirBase is used as the cache directory.
	Persist *bool
}
   129  
// RepoOpts are options scoped to the repo, not the ClientFactory level. The
// reason for the separation is to allow a single process to have, for example,
// repos that are both sparsely checked out and non-sparsely checked out.
type RepoOpts struct {
	// SparseCheckoutDirs is the list of directories that the working tree
	// should have. If non-nil and empty, then the working tree only has files
	// reachable from the root. If non-nil and non-empty, then those additional
	// directories from the root are also checked out (populated) in the working
	// tree, recursively.
	SparseCheckoutDirs []string
	// This is the `--shared` flag to `git clone`. For cloning from a local
	// source, it allows bypassing the copying of all objects. If this is true,
	// you must also set NeededCommits to a non-empty value; otherwise, when the
	// primary is updated with RemoteUpdate() the `--prune` flag may end up
	// deleting objects in the primary (which could adversely affect the
	// secondary).
	ShareObjectsWithPrimaryClone bool
	// NeededCommits lists only those commit SHAs which are needed. If the
	// commit already exists, it is not fetched to save network costs. If
	// NeededCommits is set, we do not call RemoteUpdate() for the primary clone
	// (git cache).
	NeededCommits sets.Set[string]
	// BranchesToRetarget contains a map of branch names mapped to SHAs. These
	// branch name and SHA pairs will be fed into RetargetBranch in the git v2
	// client, to update the current HEAD of each branch.
	BranchesToRetarget map[string]string
}
   156  
   157  // Apply allows to use a ClientFactoryOpts as Opt
   158  func (cfo *ClientFactoryOpts) Apply(target *ClientFactoryOpts) {
   159  	if cfo.Host != "" {
   160  		target.Host = cfo.Host
   161  	}
   162  	if cfo.UseInsecureHTTP != nil {
   163  		target.UseInsecureHTTP = cfo.UseInsecureHTTP
   164  	}
   165  	if cfo.UseSSH != nil {
   166  		target.UseSSH = cfo.UseSSH
   167  	}
   168  	if cfo.CacheDirBase != nil {
   169  		target.CacheDirBase = cfo.CacheDirBase
   170  	}
   171  	if cfo.Token != nil {
   172  		target.Token = cfo.Token
   173  	}
   174  	if cfo.GitUser != nil {
   175  		target.GitUser = cfo.GitUser
   176  	}
   177  	if cfo.Censor != nil {
   178  		target.Censor = cfo.Censor
   179  	}
   180  	if cfo.Username != nil {
   181  		target.Username = cfo.Username
   182  	}
   183  	if cfo.CookieFilePath != "" {
   184  		target.CookieFilePath = cfo.CookieFilePath
   185  	}
   186  	if cfo.Persist != nil {
   187  		target.Persist = cfo.Persist
   188  	}
   189  }
   190  
   191  func defaultTempDir() *string {
   192  	switch runtime.GOOS {
   193  	case "linux":
   194  		return utilpointer.String("/var/tmp")
   195  	default:
   196  		return utilpointer.String("")
   197  	}
   198  }
   199  
// ClientFactoryOpt is a function that manipulates the ClientFactoryOpts used
// to build a ClientFactory. NewClientFactory calls each one with a pointer to
// the options it is assembling.
type ClientFactoryOpt func(*ClientFactoryOpts)
   202  
   203  func defaultClientFactoryOpts(cfo *ClientFactoryOpts) {
   204  	if cfo.Host == "" {
   205  		cfo.Host = "github.com"
   206  	}
   207  	if cfo.CacheDirBase == nil {
   208  		// If we do not have a place to put cache, put it in temp dir.
   209  		cfo.CacheDirBase = defaultTempDir()
   210  	}
   211  	if cfo.Censor == nil {
   212  		cfo.Censor = func(in []byte) []byte { return in }
   213  	}
   214  }
   215  
   216  // NewClientFactory allows for the creation of repository clients. It uses github.com
   217  // without authentication by default, if UseSSH then returns
   218  // sshRemoteResolverFactory, and if CookieFilePath is provided then returns
   219  // gerritResolverFactory(Assuming that git http.cookiefile is used only by
   220  // Gerrit, this function needs to be updated if it turned out that this
   221  // assumtpion is not correct.)
   222  func NewClientFactory(opts ...ClientFactoryOpt) (ClientFactory, error) {
   223  	o := ClientFactoryOpts{}
   224  	defaultClientFactoryOpts(&o)
   225  	for _, opt := range opts {
   226  		opt(&o)
   227  	}
   228  
   229  	if o.CookieFilePath != "" {
   230  		if output, err := exec.Command("git", "config", "--global", "http.cookiefile", o.CookieFilePath).CombinedOutput(); err != nil {
   231  			return nil, fmt.Errorf("unable to configure http.cookiefile.\nOutput: %s\nError: %w", string(output), err)
   232  		}
   233  	}
   234  
   235  	var cacheDir string
   236  	var err error
   237  	// If we want to persist the Cache between runs, use the cacheDirBase as the cache. Otherwise make a temp dir.
   238  	if o.Persist != nil && *o.Persist {
   239  		cacheDir = *o.CacheDirBase
   240  	} else if cacheDir, err = os.MkdirTemp(*o.CacheDirBase, "gitcache"); err != nil {
   241  		return nil, err
   242  	}
   243  
   244  	var remote RemoteResolverFactory
   245  	if o.UseSSH != nil && *o.UseSSH {
   246  		remote = &sshRemoteResolverFactory{
   247  			host:     o.Host,
   248  			username: o.Username,
   249  		}
   250  	} else if o.CookieFilePath != "" {
   251  		remote = &gerritResolverFactory{}
   252  	} else {
   253  		remote = &httpResolverFactory{
   254  			host:     o.Host,
   255  			http:     o.UseInsecureHTTP != nil && *o.UseInsecureHTTP,
   256  			username: o.Username,
   257  			token:    o.Token,
   258  		}
   259  	}
   260  	return &clientFactory{
   261  		cacheDir:       cacheDir,
   262  		cacheDirBase:   *o.CacheDirBase,
   263  		remote:         remote,
   264  		gitUser:        o.GitUser,
   265  		censor:         o.Censor,
   266  		masterLock:     &sync.Mutex{},
   267  		repoLocks:      map[string]*sync.Mutex{},
   268  		logger:         logrus.WithField("client", "git"),
   269  		cookieFilePath: o.CookieFilePath,
   270  	}, nil
   271  }
   272  
   273  // NewLocalClientFactory allows for the creation of repository clients
   274  // based on a local filepath remote for testing
   275  func NewLocalClientFactory(baseDir string, gitUser GitUserGetter, censor Censor) (ClientFactory, error) {
   276  	cacheDir, err := os.MkdirTemp("", "gitcache")
   277  	if err != nil {
   278  		return nil, err
   279  	}
   280  	return &clientFactory{
   281  		cacheDir:   cacheDir,
   282  		remote:     &pathResolverFactory{baseDir: baseDir},
   283  		gitUser:    gitUser,
   284  		censor:     censor,
   285  		masterLock: &sync.Mutex{},
   286  		repoLocks:  map[string]*sync.Mutex{},
   287  		logger:     logrus.WithField("client", "git"),
   288  	}, nil
   289  }
   290  
// clientFactory is the ClientFactory implementation returned by
// NewClientFactory and NewLocalClientFactory. It keeps one cached (primary)
// clone per org/repo under cacheDir and serializes creation and refresh of
// each of them with a per-directory mutex.
type clientFactory struct {
	remote         RemoteResolverFactory
	gitUser        GitUserGetter
	censor         Censor
	logger         *logrus.Entry
	cookieFilePath string

	// cacheDir is the root under which cached clones of repos are created
	cacheDir string
	// cacheDirBase is the basedir under which create tempdirs
	cacheDirBase string
	// masterLock guards mutations to the repoLocks records
	masterLock *sync.Mutex
	// repoLocks guard mutating access to subdirectories under the cacheDir
	repoLocks map[string]*sync.Mutex
}
   307  
   308  // bootstrapClients returns a repository client and cloner for a dir.
   309  func (c *clientFactory) bootstrapClients(org, repo, dir string) (cacher, cloner, RepoClient, error) {
   310  	if dir == "" {
   311  		workdir, err := os.Getwd()
   312  		if err != nil {
   313  			return nil, nil, nil, err
   314  		}
   315  		dir = workdir
   316  	}
   317  	logger := c.logger.WithFields(logrus.Fields{"org": org, "repo": repo})
   318  	logger.WithField("dir", dir).Debug("Creating a pre-initialized client.")
   319  	executor, err := NewCensoringExecutor(dir, c.censor, logger)
   320  	if err != nil {
   321  		return nil, nil, nil, err
   322  	}
   323  	client := &repoClient{
   324  		publisher: publisher{
   325  			remotes: remotes{
   326  				publishRemote: c.remote.PublishRemote(org, repo),
   327  				centralRemote: c.remote.CentralRemote(org, repo),
   328  			},
   329  			executor: executor,
   330  			info:     c.gitUser,
   331  			logger:   logger,
   332  		},
   333  		interactor: interactor{
   334  			dir:      dir,
   335  			remote:   c.remote.CentralRemote(org, repo),
   336  			executor: executor,
   337  			logger:   logger,
   338  		},
   339  	}
   340  	return client, client, client, nil
   341  }
   342  
   343  // ClientFromDir returns a repository client for a directory that's already initialized with content.
   344  // If the directory isn't specified, the current working directory is used.
   345  func (c *clientFactory) ClientFromDir(org, repo, dir string) (RepoClient, error) {
   346  	_, _, client, err := c.bootstrapClients(org, repo, dir)
   347  	return client, err
   348  }
   349  
   350  // ClientFor wraps around ClientForWithRepoOpts using the default RepoOpts{}
   351  // (empty value). Originally, ClientFor was not a wrapper at all and did the
   352  // work inside ClientForWithRepoOpts itself, but it did this without RepoOpts.
   353  // When RepoOpts was created, we made ClientFor wrap around
   354  // ClientForWithRepoOpts to preserve behavior of existing callers of ClientFor.
   355  func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) {
   356  	return c.ClientForWithRepoOpts(org, repo, RepoOpts{})
   357  }
   358  
   359  // ClientForWithRepoOpts returns a repository client for the specified repository.
   360  // This function may take a long time if it is the first time cloning the repo.
   361  // In that case, it must do a full git mirror clone. For large repos, this can
   362  // take a while. Once that is done, it will do a git remote update (essentially
   363  // git fetch) for the mirror clone, which will usually take at most a few
   364  // seconds, before creating a secondary clone from this (updated) mirror.
   365  //
   366  // org and repo are used for determining where the repo is cloned, cloneURI
   367  // overrides org/repo for cloning.
   368  func (c *clientFactory) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) {
   369  	if repoOpts.ShareObjectsWithPrimaryClone && repoOpts.NeededCommits.Len() == 0 {
   370  		return nil, fmt.Errorf("programmer error: cannot share objects between primary and secondary without targeted fetches (NeededCommits)")
   371  	}
   372  
   373  	cacheDir := path.Join(c.cacheDir, org, repo)
   374  	c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Debug("Creating a client from the cache.")
   375  	cacheClientCacher, _, _, err := c.bootstrapClients(org, repo, cacheDir)
   376  	if err != nil {
   377  		return nil, err
   378  	}
   379  
   380  	// Put copies of the repo in temp dir.
   381  	repoDir, err := os.MkdirTemp(*defaultTempDir(), "gitrepo")
   382  	if err != nil {
   383  		return nil, err
   384  	}
   385  	_, repoClientCloner, repoClient, err := c.bootstrapClients(org, repo, repoDir)
   386  	if err != nil {
   387  		return nil, err
   388  	}
   389  
   390  	// First create or update the primary clone (in "cacheDir").
   391  	timeBeforeEnsureFreshPrimary := time.Now()
   392  	err = c.ensureFreshPrimary(cacheDir, cacheClientCacher, repoOpts, org, repo)
   393  	if err != nil {
   394  		c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Errorf("Error encountered while refreshing primary clone: %s", err.Error())
   395  	} else {
   396  		gitMetrics.ensureFreshPrimaryDuration.WithLabelValues(org, repo).Observe(time.Since(timeBeforeEnsureFreshPrimary).Seconds())
   397  	}
   398  
   399  	// Initialize the new derivative repo (secondary clone) from the primary
   400  	// clone. This is a local clone operation.
   401  	timeBeforeSecondaryClone := time.Now()
   402  	if err = repoClientCloner.CloneWithRepoOpts(cacheDir, repoOpts); err != nil {
   403  		return nil, err
   404  	}
   405  	gitMetrics.secondaryCloneDuration.WithLabelValues(org, repo).Observe(time.Since(timeBeforeSecondaryClone).Seconds())
   406  
   407  	return repoClient, nil
   408  }
   409  
   410  func (c *clientFactory) ensureFreshPrimary(
   411  	cacheDir string,
   412  	cacheClientCacher cacher,
   413  	repoOpts RepoOpts,
   414  	org string,
   415  	repo string,
   416  ) error {
   417  	if err := c.maybeCloneAndUpdatePrimary(cacheDir, cacheClientCacher, repoOpts); err != nil {
   418  		return err
   419  	}
   420  	// For targeted fetches by SHA objects, there's no need to hold a lock on
   421  	// the primary because it's safe to do so (git will first write to a
   422  	// temporary file and replace the file being written to, so if another git
   423  	// process already wrote to it, the worst case is that it will overwrite the
   424  	// file with the same data).  Targeted fetch. Only fetch those commits which
   425  	// we want, and only if they are missing.
   426  	if repoOpts.NeededCommits.Len() > 0 {
   427  		// Targeted fetch. Only fetch those commits which we want, and only if
   428  		// they are missing.
   429  		timeBeforeFetchBySha := time.Now()
   430  		if err := cacheClientCacher.FetchCommits(repoOpts.NeededCommits.UnsortedList()); err != nil {
   431  			return err
   432  		}
   433  		gitMetrics.fetchByShaDuration.WithLabelValues(org, repo).Observe(time.Since(timeBeforeFetchBySha).Seconds())
   434  
   435  		// Retarget branches. That is, make them point to a new SHA, so that the
   436  		// branches can get updated, even though we only fetch by SHA above.
   437  		//
   438  		// Because the branches never get used directly here, it's OK if this
   439  		// operation fails.
   440  		for branch, sha := range repoOpts.BranchesToRetarget {
   441  			if err := cacheClientCacher.RetargetBranch(branch, sha); err != nil {
   442  				c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir, "branch": branch}).WithError(err).Debug("failed to retarget branch")
   443  			}
   444  		}
   445  	}
   446  
   447  	return nil
   448  }
   449  
   450  // maybeCloneAndUpdatePrimary clones the primary if it doesn't exist yet, and
   451  // also runs a RemoteUpdate() against it if NeededCommits is empty. The
   452  // operations in this function are protected by a lock so that only one thread
   453  // can run at a given time for the same cacheDir (primary clone path).
   454  func (c *clientFactory) maybeCloneAndUpdatePrimary(cacheDir string, cacheClientCacher cacher, repoOpts RepoOpts) error {
   455  	// Protect access to the shared repoLocks map. The main point of all this
   456  	// locking is to ensure that we only try to create the primary clone (if it
   457  	// doesn't exist) in a serial manner.
   458  	var repoLock *sync.Mutex
   459  	c.masterLock.Lock()
   460  	if _, exists := c.repoLocks[cacheDir]; exists {
   461  		repoLock = c.repoLocks[cacheDir]
   462  	} else {
   463  		repoLock = &sync.Mutex{}
   464  		c.repoLocks[cacheDir] = repoLock
   465  	}
   466  	c.masterLock.Unlock()
   467  
   468  	repoLock.Lock()
   469  	defer repoLock.Unlock()
   470  	if _, err := os.Stat(path.Join(cacheDir, "HEAD")); os.IsNotExist(err) {
   471  		// we have not yet cloned this repo, we need to do a full clone
   472  		if err := os.MkdirAll(cacheDir, os.ModePerm); err != nil && !os.IsExist(err) {
   473  			return err
   474  		}
   475  		if err := cacheClientCacher.MirrorClone(); err != nil {
   476  			return err
   477  		}
   478  	} else if err != nil {
   479  		// something unexpected happened
   480  		return err
   481  	} else if repoOpts.NeededCommits.Len() == 0 {
   482  		// We have cloned the repo previously, but will refresh it. By default
   483  		// we refresh all refs with a call to `git remote update`.
   484  		//
   485  		// This is the default behavior if NeededCommits is empty or nil (i.e.,
   486  		// when we don't define a targeted list of commits to fetch directly).
   487  		//
   488  		// This call to RemoteUpdate() still needs to be protected by a lock
   489  		// because it updates possibly hundreds, if not thousands, of refs
   490  		// (quite literally, files in .git/refs/*).
   491  		if err := cacheClientCacher.RemoteUpdate(); err != nil {
   492  			return err
   493  		}
   494  	}
   495  
   496  	return nil
   497  }
   498  
   499  // Clean removes the caches used to generate clients
   500  func (c *clientFactory) Clean() error {
   501  	return os.RemoveAll(c.cacheDir)
   502  }