code.gitea.io/gitea@v1.19.3/modules/indexer/code/elastic_search.go (about)

     1  // Copyright 2020 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package code
     5  
     6  import (
     7  	"bufio"
     8  	"context"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"net"
    13  	"strconv"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	repo_model "code.gitea.io/gitea/models/repo"
    19  	"code.gitea.io/gitea/modules/analyze"
    20  	"code.gitea.io/gitea/modules/charset"
    21  	"code.gitea.io/gitea/modules/git"
    22  	"code.gitea.io/gitea/modules/graceful"
    23  	"code.gitea.io/gitea/modules/json"
    24  	"code.gitea.io/gitea/modules/log"
    25  	"code.gitea.io/gitea/modules/setting"
    26  	"code.gitea.io/gitea/modules/timeutil"
    27  	"code.gitea.io/gitea/modules/typesniffer"
    28  
    29  	"github.com/go-enry/go-enry/v2"
    30  	"github.com/olivere/elastic/v7"
    31  )
    32  
const (
	// esRepoIndexerLatestVersion is the current index schema version; it is
	// embedded in the real index name (see realIndexerName) so that upgrades
	// can create a new index alongside the old one and re-point the alias.
	esRepoIndexerLatestVersion = 1
	// multi-match-types, currently only 2 types are used
	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
    40  
// Compile-time assertion that ElasticSearchIndexer satisfies the Indexer interface.
var _ Indexer = &ElasticSearchIndexer{}
    42  
// ElasticSearchIndexer implements Indexer interface
type ElasticSearchIndexer struct {
	client               *elastic.Client
	indexerAliasName     string // alias name; the real index is versioned (see realIndexerName)
	available            bool   // last known availability; guarded by lock
	availabilityCallback func(bool)
	stopTimer            chan struct{} // closed by Close to stop the availability-check ticker
	lock                 sync.RWMutex  // protects available and availabilityCallback
}
    52  
// elasticLogger adapts a gitea logger to the elastic client's logging interface.
type elasticLogger struct {
	log.Logger
}

// Printf forwards a formatted message from the elastic client to the gitea log
// at the logger's own level (skipping 2 stack frames for correct caller info).
func (l elasticLogger) Printf(format string, args ...interface{}) {
	_ = l.Logger.Log(2, l.Logger.GetLevel(), format, args...)
}
    60  
    61  // NewElasticSearchIndexer creates a new elasticsearch indexer
    62  func NewElasticSearchIndexer(url, indexerName string) (*ElasticSearchIndexer, bool, error) {
    63  	opts := []elastic.ClientOptionFunc{
    64  		elastic.SetURL(url),
    65  		elastic.SetSniff(false),
    66  		elastic.SetHealthcheckInterval(10 * time.Second),
    67  		elastic.SetGzip(false),
    68  	}
    69  
    70  	logger := elasticLogger{log.GetLogger(log.DEFAULT)}
    71  
    72  	if logger.GetLevel() == log.TRACE || logger.GetLevel() == log.DEBUG {
    73  		opts = append(opts, elastic.SetTraceLog(logger))
    74  	} else if logger.GetLevel() == log.ERROR || logger.GetLevel() == log.CRITICAL || logger.GetLevel() == log.FATAL {
    75  		opts = append(opts, elastic.SetErrorLog(logger))
    76  	} else if logger.GetLevel() == log.INFO || logger.GetLevel() == log.WARN {
    77  		opts = append(opts, elastic.SetInfoLog(logger))
    78  	}
    79  
    80  	client, err := elastic.NewClient(opts...)
    81  	if err != nil {
    82  		return nil, false, err
    83  	}
    84  
    85  	indexer := &ElasticSearchIndexer{
    86  		client:           client,
    87  		indexerAliasName: indexerName,
    88  		available:        true,
    89  		stopTimer:        make(chan struct{}),
    90  	}
    91  
    92  	ticker := time.NewTicker(10 * time.Second)
    93  	go func() {
    94  		for {
    95  			select {
    96  			case <-ticker.C:
    97  				indexer.checkAvailability()
    98  			case <-indexer.stopTimer:
    99  				ticker.Stop()
   100  				return
   101  			}
   102  		}
   103  	}()
   104  
   105  	exists, err := indexer.init()
   106  	if err != nil {
   107  		indexer.Close()
   108  		return nil, false, err
   109  	}
   110  	return indexer, !exists, err
   111  }
   112  
const (
	// defaultMapping is the index mapping used when creating a new code-search
	// index: one document per file with keyword/long fields for filtering and
	// the full text content carrying term vectors ("with_positions_offsets"),
	// which the "fvh" highlighter used by Search requires.
	defaultMapping = `{
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"content": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
)
   142  
   143  func (b *ElasticSearchIndexer) realIndexerName() string {
   144  	return fmt.Sprintf("%s.v%d", b.indexerAliasName, esRepoIndexerLatestVersion)
   145  }
   146  
// Init will initialize the indexer
//
// It ensures the versioned index exists (creating it from defaultMapping when
// missing) and that the alias points at the latest index version, migrating
// the alias away from an older index when one is found. The returned bool
// reports whether the index already existed before this call.
func (b *ElasticSearchIndexer) init() (bool, error) {
	ctx := graceful.GetManager().HammerContext()
	exists, err := b.client.IndexExists(b.realIndexerName()).Do(ctx)
	if err != nil {
		return false, b.checkError(err)
	}
	if !exists {
		mapping := defaultMapping

		createIndex, err := b.client.CreateIndex(b.realIndexerName()).BodyString(mapping).Do(ctx)
		if err != nil {
			return false, b.checkError(err)
		}
		if !createIndex.Acknowledged {
			return false, fmt.Errorf("create index %s with %s failed", b.realIndexerName(), mapping)
		}
	}

	// check version
	r, err := b.client.Aliases().Do(ctx)
	if err != nil {
		return false, b.checkError(err)
	}

	realIndexerNames := r.IndicesByAlias(b.indexerAliasName)
	if len(realIndexerNames) < 1 {
		// No alias yet: point it at the freshly-created (or existing) index.
		res, err := b.client.Alias().
			Add(b.realIndexerName(), b.indexerAliasName).
			Do(ctx)
		if err != nil {
			return false, b.checkError(err)
		}
		if !res.Acknowledged {
			return false, fmt.Errorf("create alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
		}
	} else if len(realIndexerNames) >= 1 && realIndexerNames[0] < b.realIndexerName() {
		// NOTE(review): this is a lexicographic comparison of index names
		// ("<alias>.vN"), which only orders correctly while the version stays
		// single-digit ("v10" < "v2" lexicographically) — confirm before bumping
		// esRepoIndexerLatestVersion to 10.
		log.Warn("Found older gitea indexer named %s, but we will create a new one %s and keep the old NOT DELETED. You can delete the old version after the upgrade succeed.",
			realIndexerNames[0], b.realIndexerName())
		// Swap the alias from the old index to the new one in a single request.
		res, err := b.client.Alias().
			Remove(realIndexerNames[0], b.indexerAliasName).
			Add(b.realIndexerName(), b.indexerAliasName).
			Do(ctx)
		if err != nil {
			return false, b.checkError(err)
		}
		if !res.Acknowledged {
			return false, fmt.Errorf("change alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
		}
	}

	return exists, nil
}
   200  
   201  // SetAvailabilityChangeCallback sets callback that will be triggered when availability changes
   202  func (b *ElasticSearchIndexer) SetAvailabilityChangeCallback(callback func(bool)) {
   203  	b.lock.Lock()
   204  	defer b.lock.Unlock()
   205  	b.availabilityCallback = callback
   206  }
   207  
   208  // Ping checks if elastic is available
   209  func (b *ElasticSearchIndexer) Ping() bool {
   210  	b.lock.RLock()
   211  	defer b.lock.RUnlock()
   212  	return b.available
   213  }
   214  
// addUpdate builds the bulk request(s) for a single changed file. It returns
// (nil, nil) when the file should be skipped (vendored, or non-text content),
// a delete request when the file exceeds the configured maximum indexable
// size, and otherwise an index request carrying the file's UTF-8 content,
// detected language, commit sha, and a current timestamp.
func (b *ElasticSearchIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update fileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}

	size := update.Size
	var err error
	if !update.Sized {
		// Size unknown: ask git for the blob size before deciding to read it.
		var stdout string
		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	if size > setting.Indexer.MaxIndexerFileSize {
		// Too large to index: drop any previously-indexed document for this path.
		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
	}

	// Request the blob through the long-running `git cat-file --batch` process.
	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return nil, err
	}

	// The batch header line carries the actual blob size for the read below.
	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return nil, err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return nil, err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		// NOTE(review): this early return skips the Discard(1) below, leaving
		// the trailing byte of the batch entry unconsumed — confirm whether
		// ReadBatchLine tolerates that for the next blob.
		return nil, nil
	}

	// Consume the single trailing byte the batch protocol appends after the blob.
	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}
	id := filenameIndexerID(repo.ID, update.Filename)

	return []elastic.BulkableRequest{
		elastic.NewBulkIndexRequest().
			Index(b.indexerAliasName).
			Id(id).
			Doc(map[string]interface{}{
				"repo_id":    repo.ID,
				"content":    string(charset.ToUTF8DropErrors(fileContents)),
				"commit_id":  sha,
				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
				"updated_at": timeutil.TimeStampNow(),
			}),
	}, nil
}
   273  
   274  func (b *ElasticSearchIndexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
   275  	id := filenameIndexerID(repo.ID, filename)
   276  	return elastic.NewBulkDeleteRequest().
   277  		Index(b.indexerAliasName).
   278  		Id(id)
   279  }
   280  
// Index will save the index data
//
// It streams each updated file's contents out of a single long-running
// `git cat-file --batch` process, converts updates and removals into bulk
// requests, and submits them all to elasticsearch in one Bulk call.
func (b *ElasticSearchIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error {
	reqs := make([]elastic.BulkableRequest, 0)
	if len(changes.Updates) > 0 {
		// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
		if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
			log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
			return err
		}

		batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
		defer cancel()

		for _, update := range changes.Updates {
			updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo)
			if err != nil {
				return err
			}
			if len(updateReqs) > 0 {
				reqs = append(reqs, updateReqs...)
			}
		}
		// Shut the cat-file process down as soon as all blobs are read; the
		// deferred cancel above covers early error returns (presumably cancel
		// is idempotent since it runs twice on the happy path — confirm).
		cancel()
	}

	for _, filename := range changes.RemovedFilenames {
		reqs = append(reqs, b.addDelete(filename, repo))
	}

	if len(reqs) > 0 {
		// Apply every change for this commit in a single bulk request.
		_, err := b.client.Bulk().
			Index(b.indexerAliasName).
			Add(reqs...).
			Do(ctx)
		return b.checkError(err)
	}
	return nil
}
   319  
   320  // Delete deletes indexes by ids
   321  func (b *ElasticSearchIndexer) Delete(repoID int64) error {
   322  	_, err := b.client.DeleteByQuery(b.indexerAliasName).
   323  		Query(elastic.NewTermsQuery("repo_id", repoID)).
   324  		Do(graceful.GetManager().HammerContext())
   325  	return b.checkError(err)
   326  }
   327  
   328  // indexPos find words positions for start and the following end on content. It will
   329  // return the beginning position of the first start and the ending position of the
   330  // first end following the start string.
   331  // If not found any of the positions, it will return -1, -1.
   332  func indexPos(content, start, end string) (int, int) {
   333  	startIdx := strings.Index(content, start)
   334  	if startIdx < 0 {
   335  		return -1, -1
   336  	}
   337  	endIdx := strings.Index(content[startIdx+len(start):], end)
   338  	if endIdx < 0 {
   339  		return -1, -1
   340  	}
   341  	return startIdx, startIdx + len(start) + endIdx + len(end)
   342  }
   343  
   344  func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
   345  	hits := make([]*SearchResult, 0, pageSize)
   346  	for _, hit := range searchResult.Hits.Hits {
   347  		// FIXME: There is no way to get the position the keyword on the content currently on the same request.
   348  		// So we get it from content, this may made the query slower. See
   349  		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
   350  		var startIndex, endIndex int
   351  		c, ok := hit.Highlight["content"]
   352  		if ok && len(c) > 0 {
   353  			// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
   354  			// now we should find the positions. But how to avoid html content which contains the
   355  			// <em> and </em> tags? If elastic search has handled that?
   356  			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
   357  			if startIndex == -1 {
   358  				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
   359  			}
   360  		} else {
   361  			panic(fmt.Sprintf("2===%#v", hit.Highlight))
   362  		}
   363  
   364  		repoID, fileName := parseIndexerID(hit.Id)
   365  		res := make(map[string]interface{})
   366  		if err := json.Unmarshal(hit.Source, &res); err != nil {
   367  			return 0, nil, nil, err
   368  		}
   369  
   370  		language := res["language"].(string)
   371  
   372  		hits = append(hits, &SearchResult{
   373  			RepoID:      repoID,
   374  			Filename:    fileName,
   375  			CommitID:    res["commit_id"].(string),
   376  			Content:     res["content"].(string),
   377  			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
   378  			Language:    language,
   379  			StartIndex:  startIndex,
   380  			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
   381  			Color:       enry.GetColor(language),
   382  		})
   383  	}
   384  
   385  	return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
   386  }
   387  
   388  func extractAggs(searchResult *elastic.SearchResult) []*SearchResultLanguages {
   389  	var searchResultLanguages []*SearchResultLanguages
   390  	agg, found := searchResult.Aggregations.Terms("language")
   391  	if found {
   392  		searchResultLanguages = make([]*SearchResultLanguages, 0, 10)
   393  
   394  		for _, bucket := range agg.Buckets {
   395  			searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
   396  				Language: bucket.Key.(string),
   397  				Color:    enry.GetColor(bucket.Key.(string)),
   398  				Count:    int(bucket.DocCount),
   399  			})
   400  		}
   401  	}
   402  	return searchResultLanguages
   403  }
   404  
// Search searches for codes and language stats by given conditions.
//
// Without a language filter, one query returns both the page of hits (using
// the "fvh" highlighter so convertResult can recover keyword positions) and
// the per-language aggregation. With a language filter, two queries run: a
// Size(0) query on the unfiltered match to keep the full language facet
// counts, then the language-filtered query for the actual page of hits.
func (b *ElasticSearchIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
	searchType := esMultiMatchTypeBestFields
	if isMatch {
		searchType = esMultiMatchTypePhrasePrefix
	}

	kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
	query := elastic.NewBoolQuery()
	query = query.Must(kwQuery)
	if len(repoIDs) > 0 {
		// Restrict the search to the given repositories.
		repoStrs := make([]interface{}, 0, len(repoIDs))
		for _, repoID := range repoIDs {
			repoStrs = append(repoStrs, repoID)
		}
		repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
		query = query.Must(repoQuery)
	}

	var (
		start       int
		kw          = "<em>" + keyword + "</em>" // highlighted form, passed to convertResult
		aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
	)

	if page > 0 {
		start = (page - 1) * pageSize
	}

	if len(language) == 0 {
		searchResult, err := b.client.Search().
			Index(b.indexerAliasName).
			Aggregation("language", aggregation).
			Query(query).
			Highlight(
				elastic.NewHighlight().
					Field("content").
					NumOfFragments(0). // return all highting content on fragments
					HighlighterType("fvh"),
			).
			Sort("repo_id", true).
			From(start).Size(pageSize).
			Do(ctx)
		if err != nil {
			return 0, nil, nil, b.checkError(err)
		}

		return convertResult(searchResult, kw, pageSize)
	}

	langQuery := elastic.NewMatchQuery("language", language)
	countResult, err := b.client.Search().
		Index(b.indexerAliasName).
		Aggregation("language", aggregation).
		Query(query).
		Size(0). // We only needs stats information
		Do(ctx)
	if err != nil {
		return 0, nil, nil, b.checkError(err)
	}

	query = query.Must(langQuery)
	searchResult, err := b.client.Search().
		Index(b.indexerAliasName).
		Query(query).
		Highlight(
			elastic.NewHighlight().
				Field("content").
				NumOfFragments(0). // return all highting content on fragments
				HighlighterType("fvh"),
		).
		Sort("repo_id", true).
		From(start).Size(pageSize).
		Do(ctx)
	if err != nil {
		return 0, nil, nil, b.checkError(err)
	}

	total, hits, _, err := convertResult(searchResult, kw, pageSize)

	// Use the aggregation from the unfiltered count query so the language
	// facets reflect all matches, not only the filtered ones.
	return total, hits, extractAggs(countResult), err
}
   487  
   488  // Close implements indexer
   489  func (b *ElasticSearchIndexer) Close() {
   490  	select {
   491  	case <-b.stopTimer:
   492  	default:
   493  		close(b.stopTimer)
   494  	}
   495  }
   496  
   497  func (b *ElasticSearchIndexer) checkError(err error) error {
   498  	var opErr *net.OpError
   499  	if !(elastic.IsConnErr(err) || (errors.As(err, &opErr) && (opErr.Op == "dial" || opErr.Op == "read"))) {
   500  		return err
   501  	}
   502  
   503  	b.setAvailability(false)
   504  
   505  	return err
   506  }
   507  
   508  func (b *ElasticSearchIndexer) checkAvailability() {
   509  	if b.Ping() {
   510  		return
   511  	}
   512  
   513  	// Request cluster state to check if elastic is available again
   514  	_, err := b.client.ClusterState().Do(graceful.GetManager().ShutdownContext())
   515  	if err != nil {
   516  		b.setAvailability(false)
   517  		return
   518  	}
   519  
   520  	b.setAvailability(true)
   521  }
   522  
   523  func (b *ElasticSearchIndexer) setAvailability(available bool) {
   524  	b.lock.Lock()
   525  	defer b.lock.Unlock()
   526  
   527  	if b.available == available {
   528  		return
   529  	}
   530  
   531  	b.available = available
   532  	if b.availabilityCallback != nil {
   533  		// Call the callback from within the lock to ensure that the ordering remains correct
   534  		b.availabilityCallback(b.available)
   535  	}
   536  }