go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/milo/internal/git/combined_logs.go (about)

     1  // Copyright 2018 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package git
    16  
    17  import (
    18  	"container/heap"
    19  	"context"
    20  	"fmt"
    21  	"sync"
    22  
    23  	"github.com/golang/protobuf/proto"
    24  
    25  	"google.golang.org/grpc/codes"
    26  	"google.golang.org/grpc/status"
    27  
    28  	gitilesapi "go.chromium.org/luci/common/api/gitiles"
    29  	"go.chromium.org/luci/common/errors"
    30  	"go.chromium.org/luci/common/logging"
    31  	gitpb "go.chromium.org/luci/common/proto/git"
    32  	"go.chromium.org/luci/common/sync/parallel"
    33  	"go.chromium.org/luci/gae/service/datastore"
    34  	"go.chromium.org/luci/milo/internal/utils"
    35  )
    36  
// refCommits holds the list of commits (log) fetched for a single ref. The
// merge in CombinedLogs consumes the list from the front via pop.
type refCommits struct {
	// commits is the not-yet-consumed part of the log; pop removes entries from
	// its front.
	commits []*gitpb.Commit
}
    41  
    42  // The pop method removes and returns first commit. Second return value is true
    43  // if this was the last commit. Caller must ensure refCommits has commits when
    44  // calling the method.
    45  func (rc *refCommits) pop() (commit *gitpb.Commit, empty bool) {
    46  	commit, rc.commits = rc.commits[0], rc.commits[1:]
    47  	return commit, len(rc.commits) == 0
    48  }
    49  
// commitHeap implements heap.Interface over per-ref commit lists so that the
// lists can be merged with a max-heap algorithm: the list whose first commit
// is the latest sits at the root. Only the first commit of each list is used
// for comparisons, so every element of the heap must hold at least one commit.
type commitHeap []refCommits
    53  
    54  func (h commitHeap) Len() int {
    55  	return len(h)
    56  }
    57  
    58  func (h commitHeap) Swap(i, j int) {
    59  	h[i], h[j] = h[j], h[i]
    60  }
    61  
    62  func (h commitHeap) Less(i, j int) bool {
    63  	iTime := h[i].commits[0].Committer.Time.AsTime()
    64  	jTime := h[j].commits[0].Committer.Time.AsTime()
    65  
    66  	// Ensure consistent ordering based on commit hash when times are identical.
    67  	if iTime == jTime {
    68  		return h[i].commits[0].Id > h[j].commits[0].Id
    69  	}
    70  
    71  	// To make heap behave as max-heap, we consider later time to be smaller than
    72  	// earlier timer, i.e. latest commit will be the at the root of the heap.
    73  	return iTime.After(jTime)
    74  }
    75  
    76  func (h *commitHeap) Push(x any) {
    77  	*h = append(*h, x.(refCommits))
    78  }
    79  
    80  func (h *commitHeap) Pop() any {
    81  	old := *h
    82  	n := len(old)
    83  	x := old[n-1]
    84  	*h = old[0 : n-1]
    85  	return x
    86  }
    87  
// logCache stores a cached list of commits (log) for a given ref at a given
// commit position returned by Gerrit. The Key describes the query that was
// used to retrieve the log and has the following format:
//
//	host|project|ref|exclude_ref|limit
//
// When the ref moves, the entity is updated with the new CommitID and the
// updated Log. The Log field is an encoded list of commits, which is created
// by encoding a varint for the number of commits in the list followed by the
// corresponding number of serialized gitpb.Commit messages.
//
// Key is the datastore ID of the entity. CommitID is the commit the ref
// pointed at when Log was stored; a cached Log is only used while the ref
// still points at that commit.
type logCache struct {
	Key      string `gae:"$id"`
	CommitID string `gae:"commit,noindex"`
	Log      []byte `gae:"log,noindex"`

	// ref is the ref this entry was built for. It is set by logCacheFor and is
	// excluded from the stored entity by the "-" tag.
	ref string `gae:"-"`
}
   105  
   106  func logCacheFor(host, project, ref, excludeRef string, limit int) logCache {
   107  	return logCache{
   108  		Key: fmt.Sprintf("%s|%s|%s|%s|%d", host, project, ref, excludeRef, limit),
   109  		ref: ref,
   110  	}
   111  }
   112  
   113  func loadCacheFromDS(c context.Context, host, project, excludeRef string, limit int, refTips map[string]string) (cachedLogs map[string][]*gitpb.Commit) {
   114  	items := make([]logCache, 0, len(refTips))
   115  	for ref := range refTips {
   116  		items = append(items, logCacheFor(host, project, ref, excludeRef, limit))
   117  	}
   118  
   119  	cachedLogs = map[string][]*gitpb.Commit{}
   120  	var merr errors.MultiError
   121  	switch err := datastore.Get(c, items).(type) {
   122  	case errors.MultiError:
   123  		merr = err
   124  	case nil:
   125  		merr = nil
   126  	default:
   127  		return
   128  	}
   129  
   130  	for i, item := range items {
   131  		if (merr != nil && merr[i] != nil) || item.CommitID != refTips[item.ref] {
   132  			continue
   133  		}
   134  
   135  		buf := proto.NewBuffer(item.Log)
   136  		numCommits, err := buf.DecodeVarint()
   137  		if err != nil {
   138  			continue
   139  		}
   140  
   141  		log := make([]*gitpb.Commit, 0, numCommits)
   142  		for j := uint64(0); j < numCommits; j++ {
   143  			var commit gitpb.Commit
   144  			if err = buf.DecodeMessage(&commit); err != nil {
   145  				continue
   146  			}
   147  
   148  			log = append(log, &commit)
   149  		}
   150  
   151  		cachedLogs[item.ref] = log
   152  	}
   153  
   154  	return
   155  }
   156  
   157  func saveCacheToDS(c context.Context, host, project, excludeRef string, limit int, refLogs map[string][]*gitpb.Commit, refTips map[string]string) error {
   158  	items := make([]logCache, 0, len(refLogs))
   159  	totalBytes := 0
   160  	for ref, log := range refLogs {
   161  		buf := proto.NewBuffer([]byte{})
   162  		if err := buf.EncodeVarint(uint64(len(log))); err != nil {
   163  			return err
   164  		}
   165  
   166  		for _, commit := range log {
   167  			if err := buf.EncodeMessage(commit); err != nil {
   168  				return err
   169  			}
   170  		}
   171  
   172  		item := logCacheFor(host, project, ref, excludeRef, limit)
   173  		item.CommitID = refTips[ref]
   174  		item.Log = buf.Bytes()
   175  		items = append(items, item)
   176  
   177  		// This logic breaks storing caches into datastore into smaller requests to
   178  		// avoid exceeding 1MB limit on datastore requests set by AppEngine.
   179  		totalBytes += len(item.Log)
   180  		if totalBytes > 512*1024 { // 0.5 MiB
   181  			if err := datastore.Put(c, items); err != nil {
   182  				return err
   183  			}
   184  			totalBytes = 0
   185  			items = items[:0]
   186  		}
   187  	}
   188  
   189  	return datastore.Put(c, items)
   190  }
   191  
// maxGitilesLogRPCsPerRequest is the max number of Gitiles log requests allowed
// per user request to avoid exceeding Gitiles quota. When more refs than that
// need to be fetched, loadLogsForRefs reports an error instead of scheduling
// the extra requests.
const maxGitilesLogRPCsPerRequest = 50
   195  
   196  func (impl *implementation) loadLogsForRefs(c context.Context, host, project, excludeRef string, limit int, refTips map[string]string) ([][]*gitpb.Commit, error) {
   197  	cachedLogs := loadCacheFromDS(c, host, project, excludeRef, limit, refTips)
   198  	logging.Infof(c, "Fetched %d logs from cache, will fetch remaining %d logs from Gitiles", len(cachedLogs), len(refTips)-len(cachedLogs))
   199  
   200  	// Load missing logs from Gitiles.
   201  	newLogs := make(map[string][]*gitpb.Commit)
   202  	lock := sync.Mutex{} // for concurrent writes to the map
   203  	err := parallel.WorkPool(8, func(ch chan<- func() error) {
   204  		numRequests := 0
   205  		for ref := range refTips {
   206  			if _, ok := cachedLogs[ref]; ok {
   207  				continue
   208  			}
   209  
   210  			if numRequests++; numRequests > maxGitilesLogRPCsPerRequest {
   211  				ch <- func() error {
   212  					// TODO(sergiyb,tandrii): if you have genuine need for this many refs
   213  					// at once, implement a cron job that runs this very function
   214  					// continuously to avoid bursts of gitiles traffic that will make Milo
   215  					// not functional for the other projects.
   216  					return errors.Reason("too many refs are new or changed to be "+
   217  						"fetched at once, stopping after %d. Check your config and/or "+
   218  						"reload the page", maxGitilesLogRPCsPerRequest).Err()
   219  				}
   220  				break
   221  			}
   222  
   223  			ref := ref
   224  			ch <- func() error {
   225  				log, err := impl.log(c, host, project, refTips[ref], excludeRef, &LogOptions{Limit: limit})
   226  				if err != nil {
   227  					return err
   228  				}
   229  
   230  				lock.Lock()
   231  				defer lock.Unlock()
   232  				newLogs[ref] = log
   233  				return nil
   234  			}
   235  		}
   236  	})
   237  
   238  	// Try to cache what we've fetched even if some requests failed.
   239  	if derr := saveCacheToDS(c, host, project, excludeRef, limit, newLogs, refTips); derr != nil {
   240  		logging.WithError(derr).Warningf(c, "Failed to cache logs fetched from Gitiles")
   241  	}
   242  
   243  	if err != nil {
   244  		return nil, errors.Annotate(err, "failed to fetch %d logs from Gitiles", len(refTips)-len(cachedLogs)-len(newLogs)).Err()
   245  	}
   246  
   247  	// Drop ref names and create a list containing all logs.
   248  	logs := make([][]*gitpb.Commit, 0, len(cachedLogs)+len(newLogs))
   249  	for _, log := range cachedLogs {
   250  		logs = append(logs, log)
   251  	}
   252  	for _, log := range newLogs {
   253  		logs = append(logs, log)
   254  	}
   255  
   256  	return logs, nil
   257  }
   258  
   259  // CombinedLogs implements Client interface.
   260  func (impl *implementation) CombinedLogs(c context.Context, host, project, excludeRef string, refs []string, limit int) (commits []*gitpb.Commit, err error) {
   261  	defer func() { err = errors.Annotate(utils.TagGRPC(c, err), "gitiles.CombinedLogs").Err() }()
   262  
   263  	// Check if the user is allowed to access this project.
   264  	allowed, err := impl.acls.IsAllowed(c, host, project)
   265  	switch {
   266  	case err != nil:
   267  		return
   268  	case !allowed:
   269  		err = status.Errorf(codes.NotFound, "not found")
   270  		return
   271  	}
   272  
   273  	// Prepare Gitiles client.
   274  	client, err := impl.gitilesClient(c, host)
   275  	if err != nil {
   276  		return
   277  	}
   278  
   279  	// Resolve all refs and commits they are pointing at.
   280  	refTips, missingRefs, err := gitilesapi.NewRefSet(refs).Resolve(c, client, project)
   281  	if err != nil {
   282  		return
   283  	}
   284  	if len(missingRefs) > 0 {
   285  		logging.Warningf(c, "configured refs %s weren't resolved to any ref; either incorrect ACLs or redudant refs", missingRefs)
   286  	}
   287  
   288  	var logs [][]*gitpb.Commit
   289  	if logs, err = impl.loadLogsForRefs(c, host, project, excludeRef, limit, refTips); err != nil {
   290  		return
   291  	}
   292  
   293  	// We merge commits from all refs sorted by time into a single list up to a
   294  	// limit. We use max-heap based merging algorithm below.
   295  	var h commitHeap
   296  	for _, log := range logs {
   297  		if len(log) > 0 {
   298  			h = append(h, refCommits{log})
   299  		}
   300  	}
   301  
   302  	// Keep adding commits to the merged list until we reach the limit or run out
   303  	// of commits on all refs.
   304  	heap.Init(&h)
   305  	commits = make([]*gitpb.Commit, 0, limit)
   306  	for len(commits) < limit && len(h) != 0 {
   307  		commit, empty := h[0].pop()
   308  		// Do not add duplicate commits that come from different refs.
   309  		if len(commits) == 0 || commits[len(commits)-1].Id != commit.Id {
   310  			commits = append(commits, commit)
   311  		}
   312  		if empty {
   313  			heap.Remove(&h, 0)
   314  		} else {
   315  			heap.Fix(&h, 0)
   316  		}
   317  	}
   318  
   319  	return
   320  }