code.gitea.io/gitea@v1.22.3/modules/indexer/code/elasticsearch/elasticsearch.go (about)

     1  // Copyright 2020 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package elasticsearch
     5  
     6  import (
     7  	"bufio"
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"strconv"
    12  	"strings"
    13  
    14  	repo_model "code.gitea.io/gitea/models/repo"
    15  	"code.gitea.io/gitea/modules/analyze"
    16  	"code.gitea.io/gitea/modules/charset"
    17  	"code.gitea.io/gitea/modules/git"
    18  	"code.gitea.io/gitea/modules/gitrepo"
    19  	"code.gitea.io/gitea/modules/indexer/code/internal"
    20  	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
    21  	inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
    22  	"code.gitea.io/gitea/modules/json"
    23  	"code.gitea.io/gitea/modules/log"
    24  	"code.gitea.io/gitea/modules/setting"
    25  	"code.gitea.io/gitea/modules/timeutil"
    26  	"code.gitea.io/gitea/modules/typesniffer"
    27  
    28  	"github.com/go-enry/go-enry/v2"
    29  	"github.com/olivere/elastic/v7"
    30  )
    31  
const (
	// esRepoIndexerLatestVersion is baked into the versioned index name so a
	// mapping change can roll over to a fresh index.
	esRepoIndexerLatestVersion = 1
	// multi-match-types, currently only 2 types are used
	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
    39  
// compile-time assertion that Indexer satisfies the code indexer interface
var _ internal.Indexer = &Indexer{}

// Indexer implements Indexer interface
type Indexer struct {
	inner                    *inner_elasticsearch.Indexer
	indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
}
    47  
    48  // NewIndexer creates a new elasticsearch indexer
    49  func NewIndexer(url, indexerName string) *Indexer {
    50  	inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
    51  	indexer := &Indexer{
    52  		inner:   inner,
    53  		Indexer: inner,
    54  	}
    55  	return indexer
    56  }
    57  
const (
	// defaultMapping is the elasticsearch index mapping for code documents.
	// NOTE: "term_vector": "with_positions_offsets" on "content" is what
	// enables the "fvh" highlighter used by Search.
	defaultMapping = `{
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"content": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
)
    87  
// addUpdate builds the bulk requests needed to (re-)index one changed file.
// It returns (nil, nil) when the file should be skipped (vendored or binary),
// a single delete request when the file exceeds the configured size limit,
// and otherwise a single index request carrying the file contents.
// batchWriter/batchReader are the stdin/stdout pipes of a long-running
// `git cat-file --batch` process.
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}

	size := update.Size
	var err error
	if !update.Sized {
		// The caller did not provide the blob size; ask git for it.
		var stdout string
		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	// Oversized files are dropped from the index rather than indexed.
	if size > setting.Indexer.MaxIndexerFileSize {
		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
	}

	// Request the blob contents from the cat-file --batch process.
	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return nil, err
	}

	// The batch header line carries the actual blob size to read.
	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return nil, err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return nil, err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		return nil, nil
	}

	// Consume the trailing newline cat-file --batch appends after each blob.
	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}
	id := internal.FilenameIndexerID(repo.ID, update.Filename)

	return []elastic.BulkableRequest{
		elastic.NewBulkIndexRequest().
			Index(b.inner.VersionedIndexName()).
			Id(id).
			Doc(map[string]any{
				"repo_id":    repo.ID,
				"content":    string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
				"commit_id":  sha,
				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
				"updated_at": timeutil.TimeStampNow(),
			}),
	}, nil
}
   146  
   147  func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
   148  	id := internal.FilenameIndexerID(repo.ID, filename)
   149  	return elastic.NewBulkDeleteRequest().
   150  		Index(b.inner.VersionedIndexName()).
   151  		Id(id)
   152  }
   153  
// Index will save the index data
// It builds bulk index requests for every update and bulk delete requests for
// every removed file in changes, then submits them to elasticsearch in
// batches of 50.
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	reqs := make([]elastic.BulkableRequest, 0)
	if len(changes.Updates) > 0 {
		r, err := gitrepo.OpenRepository(ctx, repo)
		if err != nil {
			return err
		}
		defer r.Close()
		batch, err := r.NewBatch(ctx)
		if err != nil {
			return err
		}
		defer batch.Close()

		for _, update := range changes.Updates {
			updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo)
			if err != nil {
				return err
			}
			if len(updateReqs) > 0 {
				reqs = append(reqs, updateReqs...)
			}
		}
		// Close the batch explicitly as soon as it is no longer needed; the
		// deferred Close above only acts as a safety net for early returns.
		batch.Close()
	}

	for _, filename := range changes.RemovedFilenames {
		reqs = append(reqs, b.addDelete(filename, repo))
	}

	if len(reqs) > 0 {
		// Cap the per-request payload by slicing the bulk operations into
		// chunks of 50.
		esBatchSize := 50

		for i := 0; i < len(reqs); i += esBatchSize {
			_, err := b.inner.Client.Bulk().
				Index(b.inner.VersionedIndexName()).
				Add(reqs[i:min(i+esBatchSize, len(reqs))]...).
				Do(ctx)
			if err != nil {
				return err
			}
		}
	}
	return nil
}
   200  
   201  // Delete entries by repoId
   202  func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
   203  	if err := b.doDelete(ctx, repoID); err != nil {
   204  		// Maybe there is a conflict during the delete operation, so we should retry after a refresh
   205  		log.Warn("Deletion of entries of repo %v within index %v was erroneus. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err)
   206  		if err := b.refreshIndex(ctx); err != nil {
   207  			return err
   208  		}
   209  		if err := b.doDelete(ctx, repoID); err != nil {
   210  			log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName())
   211  			return err
   212  		}
   213  	}
   214  	return nil
   215  }
   216  
   217  func (b *Indexer) refreshIndex(ctx context.Context) error {
   218  	if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil {
   219  		log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err)
   220  		return err
   221  	}
   222  
   223  	return nil
   224  }
   225  
   226  // Delete entries by repoId
   227  func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
   228  	_, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
   229  		Query(elastic.NewTermsQuery("repo_id", repoID)).
   230  		Do(ctx)
   231  	return err
   232  }
   233  
   234  // indexPos find words positions for start and the following end on content. It will
   235  // return the beginning position of the first start and the ending position of the
   236  // first end following the start string.
   237  // If not found any of the positions, it will return -1, -1.
   238  func indexPos(content, start, end string) (int, int) {
   239  	startIdx := strings.Index(content, start)
   240  	if startIdx < 0 {
   241  		return -1, -1
   242  	}
   243  	endIdx := strings.Index(content[startIdx+len(start):], end)
   244  	if endIdx < 0 {
   245  		return -1, -1
   246  	}
   247  	return startIdx, startIdx + len(start) + endIdx + len(end)
   248  }
   249  
   250  func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
   251  	hits := make([]*internal.SearchResult, 0, pageSize)
   252  	for _, hit := range searchResult.Hits.Hits {
   253  		// FIXME: There is no way to get the position the keyword on the content currently on the same request.
   254  		// So we get it from content, this may made the query slower. See
   255  		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
   256  		var startIndex, endIndex int
   257  		c, ok := hit.Highlight["content"]
   258  		if ok && len(c) > 0 {
   259  			// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
   260  			// now we should find the positions. But how to avoid html content which contains the
   261  			// <em> and </em> tags? If elastic search has handled that?
   262  			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
   263  			if startIndex == -1 {
   264  				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
   265  			}
   266  		} else {
   267  			panic(fmt.Sprintf("2===%#v", hit.Highlight))
   268  		}
   269  
   270  		repoID, fileName := internal.ParseIndexerID(hit.Id)
   271  		res := make(map[string]any)
   272  		if err := json.Unmarshal(hit.Source, &res); err != nil {
   273  			return 0, nil, nil, err
   274  		}
   275  
   276  		language := res["language"].(string)
   277  
   278  		hits = append(hits, &internal.SearchResult{
   279  			RepoID:      repoID,
   280  			Filename:    fileName,
   281  			CommitID:    res["commit_id"].(string),
   282  			Content:     res["content"].(string),
   283  			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
   284  			Language:    language,
   285  			StartIndex:  startIndex,
   286  			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
   287  			Color:       enry.GetColor(language),
   288  		})
   289  	}
   290  
   291  	return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
   292  }
   293  
   294  func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
   295  	var searchResultLanguages []*internal.SearchResultLanguages
   296  	agg, found := searchResult.Aggregations.Terms("language")
   297  	if found {
   298  		searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)
   299  
   300  		for _, bucket := range agg.Buckets {
   301  			searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
   302  				Language: bucket.Key.(string),
   303  				Color:    enry.GetColor(bucket.Key.(string)),
   304  				Count:    int(bucket.DocCount),
   305  			})
   306  		}
   307  	}
   308  	return searchResultLanguages
   309  }
   310  
// Search searches for codes and language stats by given conditions.
// Without a language filter a single request returns both the hits and the
// per-language aggregation. With a language filter two requests are made:
// the language stats are aggregated over the unfiltered query, while the
// hits come from the language-filtered query.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	// Fuzzy search uses best_fields matching; exact search uses phrase_prefix.
	searchType := esMultiMatchTypePhrasePrefix
	if opts.IsKeywordFuzzy {
		searchType = esMultiMatchTypeBestFields
	}

	kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
	query := elastic.NewBoolQuery()
	query = query.Must(kwQuery)
	if len(opts.RepoIDs) > 0 {
		// Restrict the search to the requested repositories.
		repoStrs := make([]any, 0, len(opts.RepoIDs))
		for _, repoID := range opts.RepoIDs {
			repoStrs = append(repoStrs, repoID)
		}
		repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
		query = query.Must(repoQuery)
	}

	var (
		start, pageSize = opts.GetSkipTake()
		// kw is the highlighted form of the keyword, used by convertResult
		// for error reporting.
		kw          = "<em>" + opts.Keyword + "</em>"
		aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
	)

	if len(opts.Language) == 0 {
		searchResult, err := b.inner.Client.Search().
			Index(b.inner.VersionedIndexName()).
			Aggregation("language", aggregation).
			Query(query).
			Highlight(
				elastic.NewHighlight().
					Field("content").
					NumOfFragments(0). // return all highlighting content on fragments
					HighlighterType("fvh"),
			).
			Sort("repo_id", true).
			From(start).Size(pageSize).
			Do(ctx)
		if err != nil {
			return 0, nil, nil, err
		}

		return convertResult(searchResult, kw, pageSize)
	}

	langQuery := elastic.NewMatchQuery("language", opts.Language)
	countResult, err := b.inner.Client.Search().
		Index(b.inner.VersionedIndexName()).
		Aggregation("language", aggregation).
		Query(query).
		Size(0). // We only need stats information
		Do(ctx)
	if err != nil {
		return 0, nil, nil, err
	}

	query = query.Must(langQuery)
	searchResult, err := b.inner.Client.Search().
		Index(b.inner.VersionedIndexName()).
		Query(query).
		Highlight(
			elastic.NewHighlight().
				Field("content").
				NumOfFragments(0). // return all highlighting content on fragments
				HighlighterType("fvh"),
		).
		Sort("repo_id", true).
		From(start).Size(pageSize).
		Do(ctx)
	if err != nil {
		return 0, nil, nil, err
	}

	// Discard the filtered aggregation; the language stats come from the
	// unfiltered countResult above.
	total, hits, _, err := convertResult(searchResult, kw, pageSize)

	return total, hits, extractAggs(countResult), err
}