github.com/zppinho/prow@v0.0.0-20240510014325-1738badeb017/pkg/git/v2/client_factory.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package git 18 19 import ( 20 "fmt" 21 "os" 22 "os/exec" 23 "path" 24 "runtime" 25 "sync" 26 "time" 27 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/sirupsen/logrus" 30 "k8s.io/apimachinery/pkg/util/sets" 31 utilpointer "k8s.io/utils/pointer" 32 ) 33 34 var gitMetrics = struct { 35 ensureFreshPrimaryDuration *prometheus.HistogramVec 36 fetchByShaDuration *prometheus.HistogramVec 37 secondaryCloneDuration *prometheus.HistogramVec 38 sparseCheckoutDuration prometheus.Histogram 39 }{ 40 ensureFreshPrimaryDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ 41 Name: "git_ensure_fresh_primary_duration", 42 Help: "Histogram of seconds spent ensuring that the primary is fresh, by org and repo.", 43 Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90, 120, 180, 300, 450, 600, 750, 900, 1050, 1200}, 44 }, []string{ 45 "org", "repo", 46 }), 47 fetchByShaDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ 48 Name: "git_fetch_by_sha_duration", 49 Help: "Histogram of seconds spent fetching commit SHAs, by org and repo.", 50 Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90, 120, 180, 300, 450, 600, 750, 900, 1050, 1200}, 51 }, []string{ 52 "org", "repo", 53 }), 54 secondaryCloneDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ 55 Name: "git_secondary_clone_duration", 56 Help: "Histogram of seconds spent creating the secondary clone, by org and repo.", 57 Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90}, 58 }, []string{ 59 "org", "repo", 60 }), 61 sparseCheckoutDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ 62 Name: "sparse_checkout_duration", 63 Help: "Histogram of seconds spent performing sparse checkout for a repository", 64 Buckets: []float64{0.5, 1, 2, 5, 10, 20, 30, 45, 60, 90}, 65 }), 66 } 67 68 func init() { 69 prometheus.MustRegister(gitMetrics.ensureFreshPrimaryDuration) 70 prometheus.MustRegister(gitMetrics.fetchByShaDuration) 71 prometheus.MustRegister(gitMetrics.secondaryCloneDuration) 72 prometheus.MustRegister(gitMetrics.sparseCheckoutDuration) 73 } 74 75 // ClientFactory knows how to create clientFactory for repos 76 type ClientFactory interface { 77 // ClientFromDir creates a client that operates on a repo that has already 78 // been cloned to the given directory. 79 ClientFromDir(org, repo, dir string) (RepoClient, error) 80 // ClientFor creates a client that operates on a new clone of the repo. 81 ClientFor(org, repo string) (RepoClient, error) 82 // ClientForWithRepoOpts is like ClientFor, but allows you to customize the 83 // setup of the cloned repo (such as sparse checkouts instead of using the 84 // default full clone). 85 ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) 86 87 // Clean removes the caches used to generate clients 88 Clean() error 89 } 90 91 // RepoClient exposes interactions with a git repo 92 type RepoClient interface { 93 Publisher 94 Interactor 95 } 96 97 type repoClient struct { 98 publisher 99 interactor 100 } 101 102 type ClientFactoryOpts struct { 103 // Host, defaults to "github.com" if unset 104 Host string 105 // Whether to use HTTP. By default, HTTPS is used (overrides UseSSH). 106 // 107 // TODO (listx): Combine HTTPS, HTTP, and SSH schemes into a single enum. 108 UseInsecureHTTP *bool 109 // UseSSH, defaults to false 110 UseSSH *bool 111 // The directory in which the cache should be 112 // created. Defaults to the "/var/tmp" on 113 // Linux and os.TempDir otherwise 114 CacheDirBase *string 115 // If unset, publishing action will error 116 Username LoginGetter 117 // If unset, publishing action will error 118 Token TokenGetter 119 // The git user to use. 120 GitUser GitUserGetter 121 // The censor to use. Not needed for anonymous 122 // actions. 123 Censor Censor 124 // Path to the httpCookieFile that will be used to authenticate client 125 CookieFilePath string 126 // If set, cacheDir persist. Otherwise temp dir will be used for CacheDir 127 Persist *bool 128 } 129 130 // These options are scoped to the repo, not the ClientFactory level. The reason 131 // for the separation is to allow a single process to have for example repos 132 // that are both sparsely checked out and non-sparsely checked out. 133 type RepoOpts struct { 134 // sparseCheckoutDirs is the list of directories that the working tree 135 // should have. If non-nil and empty, then the working tree only has files 136 // reachable from the root. If non-nil and non-empty, then those additional 137 // directories from the root are also checked out (populated) in the working 138 // tree, recursively. 139 SparseCheckoutDirs []string 140 // This is the `--share` flag to `git clone`. For cloning from a local 141 // source, it allows bypassing the copying of all objects. If this is true, 142 // you must also set NeededCommits to a non-empty value; otherwise, when the 143 // primary is updated with RemoteUpdate() the `--prune` flag may end up 144 // deleting objects in the primary (which could adversely affect the 145 // secondary). 146 ShareObjectsWithPrimaryClone bool 147 // NeededCommits list only those commit SHAs which are needed. If the commit 148 // already exists, it is not fetched to save network costs. If NeededCommits 149 // is set, we do not call RemoteUpdate() for the primary clone (git cache). 150 NeededCommits sets.Set[string] 151 // BranchesToRetarget contains a map of branch names mapped to SHAs. These 152 // branch name and SHA pairs will be fed into RetargetBranch in the git v2 153 // client, to update the current HEAD of each branch. 154 BranchesToRetarget map[string]string 155 } 156 157 // Apply allows to use a ClientFactoryOpts as Opt 158 func (cfo *ClientFactoryOpts) Apply(target *ClientFactoryOpts) { 159 if cfo.Host != "" { 160 target.Host = cfo.Host 161 } 162 if cfo.UseInsecureHTTP != nil { 163 target.UseInsecureHTTP = cfo.UseInsecureHTTP 164 } 165 if cfo.UseSSH != nil { 166 target.UseSSH = cfo.UseSSH 167 } 168 if cfo.CacheDirBase != nil { 169 target.CacheDirBase = cfo.CacheDirBase 170 } 171 if cfo.Token != nil { 172 target.Token = cfo.Token 173 } 174 if cfo.GitUser != nil { 175 target.GitUser = cfo.GitUser 176 } 177 if cfo.Censor != nil { 178 target.Censor = cfo.Censor 179 } 180 if cfo.Username != nil { 181 target.Username = cfo.Username 182 } 183 if cfo.CookieFilePath != "" { 184 target.CookieFilePath = cfo.CookieFilePath 185 } 186 if cfo.Persist != nil { 187 target.Persist = cfo.Persist 188 } 189 } 190 191 func defaultTempDir() *string { 192 switch runtime.GOOS { 193 case "linux": 194 return utilpointer.String("/var/tmp") 195 default: 196 return utilpointer.String("") 197 } 198 } 199 200 // ClientFactoryOpts allows to manipulate the options for a ClientFactory 201 type ClientFactoryOpt func(*ClientFactoryOpts) 202 203 func defaultClientFactoryOpts(cfo *ClientFactoryOpts) { 204 if cfo.Host == "" { 205 cfo.Host = "github.com" 206 } 207 if cfo.CacheDirBase == nil { 208 // If we do not have a place to put cache, put it in temp dir. 209 cfo.CacheDirBase = defaultTempDir() 210 } 211 if cfo.Censor == nil { 212 cfo.Censor = func(in []byte) []byte { return in } 213 } 214 } 215 216 // NewClientFactory allows for the creation of repository clients. It uses github.com 217 // without authentication by default, if UseSSH then returns 218 // sshRemoteResolverFactory, and if CookieFilePath is provided then returns 219 // gerritResolverFactory(Assuming that git http.cookiefile is used only by 220 // Gerrit, this function needs to be updated if it turned out that this 221 // assumtpion is not correct.) 222 func NewClientFactory(opts ...ClientFactoryOpt) (ClientFactory, error) { 223 o := ClientFactoryOpts{} 224 defaultClientFactoryOpts(&o) 225 for _, opt := range opts { 226 opt(&o) 227 } 228 229 if o.CookieFilePath != "" { 230 if output, err := exec.Command("git", "config", "--global", "http.cookiefile", o.CookieFilePath).CombinedOutput(); err != nil { 231 return nil, fmt.Errorf("unable to configure http.cookiefile.\nOutput: %s\nError: %w", string(output), err) 232 } 233 } 234 235 var cacheDir string 236 var err error 237 // If we want to persist the Cache between runs, use the cacheDirBase as the cache. Otherwise make a temp dir. 238 if o.Persist != nil && *o.Persist { 239 cacheDir = *o.CacheDirBase 240 } else if cacheDir, err = os.MkdirTemp(*o.CacheDirBase, "gitcache"); err != nil { 241 return nil, err 242 } 243 244 var remote RemoteResolverFactory 245 if o.UseSSH != nil && *o.UseSSH { 246 remote = &sshRemoteResolverFactory{ 247 host: o.Host, 248 username: o.Username, 249 } 250 } else if o.CookieFilePath != "" { 251 remote = &gerritResolverFactory{} 252 } else { 253 remote = &httpResolverFactory{ 254 host: o.Host, 255 http: o.UseInsecureHTTP != nil && *o.UseInsecureHTTP, 256 username: o.Username, 257 token: o.Token, 258 } 259 } 260 return &clientFactory{ 261 cacheDir: cacheDir, 262 cacheDirBase: *o.CacheDirBase, 263 remote: remote, 264 gitUser: o.GitUser, 265 censor: o.Censor, 266 masterLock: &sync.Mutex{}, 267 repoLocks: map[string]*sync.Mutex{}, 268 logger: logrus.WithField("client", "git"), 269 cookieFilePath: o.CookieFilePath, 270 }, nil 271 } 272 273 // NewLocalClientFactory allows for the creation of repository clients 274 // based on a local filepath remote for testing 275 func NewLocalClientFactory(baseDir string, gitUser GitUserGetter, censor Censor) (ClientFactory, error) { 276 cacheDir, err := os.MkdirTemp("", "gitcache") 277 if err != nil { 278 return nil, err 279 } 280 return &clientFactory{ 281 cacheDir: cacheDir, 282 remote: &pathResolverFactory{baseDir: baseDir}, 283 gitUser: gitUser, 284 censor: censor, 285 masterLock: &sync.Mutex{}, 286 repoLocks: map[string]*sync.Mutex{}, 287 logger: logrus.WithField("client", "git"), 288 }, nil 289 } 290 291 type clientFactory struct { 292 remote RemoteResolverFactory 293 gitUser GitUserGetter 294 censor Censor 295 logger *logrus.Entry 296 cookieFilePath string 297 298 // cacheDir is the root under which cached clones of repos are created 299 cacheDir string 300 // cacheDirBase is the basedir under which create tempdirs 301 cacheDirBase string 302 // masterLock guards mutations to the repoLocks records 303 masterLock *sync.Mutex 304 // repoLocks guard mutating access to subdirectories under the cacheDir 305 repoLocks map[string]*sync.Mutex 306 } 307 308 // bootstrapClients returns a repository client and cloner for a dir. 309 func (c *clientFactory) bootstrapClients(org, repo, dir string) (cacher, cloner, RepoClient, error) { 310 if dir == "" { 311 workdir, err := os.Getwd() 312 if err != nil { 313 return nil, nil, nil, err 314 } 315 dir = workdir 316 } 317 logger := c.logger.WithFields(logrus.Fields{"org": org, "repo": repo}) 318 logger.WithField("dir", dir).Debug("Creating a pre-initialized client.") 319 executor, err := NewCensoringExecutor(dir, c.censor, logger) 320 if err != nil { 321 return nil, nil, nil, err 322 } 323 client := &repoClient{ 324 publisher: publisher{ 325 remotes: remotes{ 326 publishRemote: c.remote.PublishRemote(org, repo), 327 centralRemote: c.remote.CentralRemote(org, repo), 328 }, 329 executor: executor, 330 info: c.gitUser, 331 logger: logger, 332 }, 333 interactor: interactor{ 334 dir: dir, 335 remote: c.remote.CentralRemote(org, repo), 336 executor: executor, 337 logger: logger, 338 }, 339 } 340 return client, client, client, nil 341 } 342 343 // ClientFromDir returns a repository client for a directory that's already initialized with content. 344 // If the directory isn't specified, the current working directory is used. 345 func (c *clientFactory) ClientFromDir(org, repo, dir string) (RepoClient, error) { 346 _, _, client, err := c.bootstrapClients(org, repo, dir) 347 return client, err 348 } 349 350 // ClientFor wraps around ClientForWithRepoOpts using the default RepoOpts{} 351 // (empty value). Originally, ClientFor was not a wrapper at all and did the 352 // work inside ClientForWithRepoOpts itself, but it did this without RepoOpts. 353 // When RepoOpts was created, we made ClientFor wrap around 354 // ClientForWithRepoOpts to preserve behavior of existing callers of ClientFor. 355 func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) { 356 return c.ClientForWithRepoOpts(org, repo, RepoOpts{}) 357 } 358 359 // ClientForWithRepoOpts returns a repository client for the specified repository. 360 // This function may take a long time if it is the first time cloning the repo. 361 // In that case, it must do a full git mirror clone. For large repos, this can 362 // take a while. Once that is done, it will do a git remote update (essentially 363 // git fetch) for the mirror clone, which will usually take at most a few 364 // seconds, before creating a secondary clone from this (updated) mirror. 365 // 366 // org and repo are used for determining where the repo is cloned, cloneURI 367 // overrides org/repo for cloning. 368 func (c *clientFactory) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) { 369 if repoOpts.ShareObjectsWithPrimaryClone && repoOpts.NeededCommits.Len() == 0 { 370 return nil, fmt.Errorf("programmer error: cannot share objects between primary and secondary without targeted fetches (NeededCommits)") 371 } 372 373 cacheDir := path.Join(c.cacheDir, org, repo) 374 c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Debug("Creating a client from the cache.") 375 cacheClientCacher, _, _, err := c.bootstrapClients(org, repo, cacheDir) 376 if err != nil { 377 return nil, err 378 } 379 380 // Put copies of the repo in temp dir. 381 repoDir, err := os.MkdirTemp(*defaultTempDir(), "gitrepo") 382 if err != nil { 383 return nil, err 384 } 385 _, repoClientCloner, repoClient, err := c.bootstrapClients(org, repo, repoDir) 386 if err != nil { 387 return nil, err 388 } 389 390 // First create or update the primary clone (in "cacheDir"). 391 timeBeforeEnsureFreshPrimary := time.Now() 392 err = c.ensureFreshPrimary(cacheDir, cacheClientCacher, repoOpts, org, repo) 393 if err != nil { 394 c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Errorf("Error encountered while refreshing primary clone: %s", err.Error()) 395 } else { 396 gitMetrics.ensureFreshPrimaryDuration.WithLabelValues(org, repo).Observe(time.Since(timeBeforeEnsureFreshPrimary).Seconds()) 397 } 398 399 // Initialize the new derivative repo (secondary clone) from the primary 400 // clone. This is a local clone operation. 401 timeBeforeSecondaryClone := time.Now() 402 if err = repoClientCloner.CloneWithRepoOpts(cacheDir, repoOpts); err != nil { 403 return nil, err 404 } 405 gitMetrics.secondaryCloneDuration.WithLabelValues(org, repo).Observe(time.Since(timeBeforeSecondaryClone).Seconds()) 406 407 return repoClient, nil 408 } 409 410 func (c *clientFactory) ensureFreshPrimary( 411 cacheDir string, 412 cacheClientCacher cacher, 413 repoOpts RepoOpts, 414 org string, 415 repo string, 416 ) error { 417 if err := c.maybeCloneAndUpdatePrimary(cacheDir, cacheClientCacher, repoOpts); err != nil { 418 return err 419 } 420 // For targeted fetches by SHA objects, there's no need to hold a lock on 421 // the primary because it's safe to do so (git will first write to a 422 // temporary file and replace the file being written to, so if another git 423 // process already wrote to it, the worst case is that it will overwrite the 424 // file with the same data). Targeted fetch. Only fetch those commits which 425 // we want, and only if they are missing. 426 if repoOpts.NeededCommits.Len() > 0 { 427 // Targeted fetch. Only fetch those commits which we want, and only if 428 // they are missing. 429 timeBeforeFetchBySha := time.Now() 430 if err := cacheClientCacher.FetchCommits(repoOpts.NeededCommits.UnsortedList()); err != nil { 431 return err 432 } 433 gitMetrics.fetchByShaDuration.WithLabelValues(org, repo).Observe(time.Since(timeBeforeFetchBySha).Seconds()) 434 435 // Retarget branches. That is, make them point to a new SHA, so that the 436 // branches can get updated, even though we only fetch by SHA above. 437 // 438 // Because the branches never get used directly here, it's OK if this 439 // operation fails. 440 for branch, sha := range repoOpts.BranchesToRetarget { 441 if err := cacheClientCacher.RetargetBranch(branch, sha); err != nil { 442 c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir, "branch": branch}).WithError(err).Debug("failed to retarget branch") 443 } 444 } 445 } 446 447 return nil 448 } 449 450 // maybeCloneAndUpdatePrimary clones the primary if it doesn't exist yet, and 451 // also runs a RemoteUpdate() against it if NeededCommits is empty. The 452 // operations in this function are protected by a lock so that only one thread 453 // can run at a given time for the same cacheDir (primary clone path). 454 func (c *clientFactory) maybeCloneAndUpdatePrimary(cacheDir string, cacheClientCacher cacher, repoOpts RepoOpts) error { 455 // Protect access to the shared repoLocks map. The main point of all this 456 // locking is to ensure that we only try to create the primary clone (if it 457 // doesn't exist) in a serial manner. 458 var repoLock *sync.Mutex 459 c.masterLock.Lock() 460 if _, exists := c.repoLocks[cacheDir]; exists { 461 repoLock = c.repoLocks[cacheDir] 462 } else { 463 repoLock = &sync.Mutex{} 464 c.repoLocks[cacheDir] = repoLock 465 } 466 c.masterLock.Unlock() 467 468 repoLock.Lock() 469 defer repoLock.Unlock() 470 if _, err := os.Stat(path.Join(cacheDir, "HEAD")); os.IsNotExist(err) { 471 // we have not yet cloned this repo, we need to do a full clone 472 if err := os.MkdirAll(cacheDir, os.ModePerm); err != nil && !os.IsExist(err) { 473 return err 474 } 475 if err := cacheClientCacher.MirrorClone(); err != nil { 476 return err 477 } 478 } else if err != nil { 479 // something unexpected happened 480 return err 481 } else if repoOpts.NeededCommits.Len() == 0 { 482 // We have cloned the repo previously, but will refresh it. By default 483 // we refresh all refs with a call to `git remote update`. 484 // 485 // This is the default behavior if NeededCommits is empty or nil (i.e., 486 // when we don't define a targeted list of commits to fetch directly). 487 // 488 // This call to RemoteUpdate() still needs to be protected by a lock 489 // because it updates possibly hundreds, if not thousands, of refs 490 // (quite literally, files in .git/refs/*). 491 if err := cacheClientCacher.RemoteUpdate(); err != nil { 492 return err 493 } 494 } 495 496 return nil 497 } 498 499 // Clean removes the caches used to generate clients 500 func (c *clientFactory) Clean() error { 501 return os.RemoveAll(c.cacheDir) 502 }