code.gitea.io/gitea@v1.19.3/modules/indexer/code/bleve.go (about)

     1  // Copyright 2019 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package code
     5  
import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strconv"
	"strings"
	"time"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	gitea_bleve "code.gitea.io/gitea/modules/indexer/bleve"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"
	"code.gitea.io/gitea/modules/util"

	"github.com/blevesearch/bleve/v2"
	analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/index/upsidedown"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/ethantkoenig/rupture"
	"github.com/go-enry/go-enry/v2"
)
    40  
    41  const (
    42  	unicodeNormalizeName = "unicodeNormalize"
    43  	maxBatchSize         = 16
    44  )
    45  
    46  // numericEqualityQuery a numeric equality query for the given value and field
    47  func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
    48  	f := float64(value)
    49  	tru := true
    50  	q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
    51  	q.SetField(field)
    52  	return q
    53  }
    54  
    55  func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
    56  	return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
    57  		"type": unicodenorm.Name,
    58  		"form": unicodenorm.NFC,
    59  	})
    60  }
    61  
    62  // openBleveIndexer open the index at the specified path, checking for metadata
    63  // updates and bleve version updates.  If index needs to be created (or
    64  // re-created), returns (nil, nil)
    65  func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
    66  	_, err := os.Stat(path)
    67  	if err != nil && os.IsNotExist(err) {
    68  		return nil, nil
    69  	} else if err != nil {
    70  		return nil, err
    71  	}
    72  
    73  	metadata, err := rupture.ReadIndexMetadata(path)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  	if metadata.Version < latestVersion {
    78  		// the indexer is using a previous version, so we should delete it and
    79  		// re-populate
    80  		return nil, util.RemoveAll(path)
    81  	}
    82  
    83  	index, err := bleve.Open(path)
    84  	if err != nil && err == upsidedown.IncompatibleVersion {
    85  		// the indexer was built with a previous version of bleve, so we should
    86  		// delete it and re-populate
    87  		return nil, util.RemoveAll(path)
    88  	} else if err != nil {
    89  		return nil, err
    90  	}
    91  	return index, nil
    92  }
    93  
// RepoIndexerData is the document stored in the repo indexer for one file.
type RepoIndexerData struct {
	RepoID    int64     // ID of the repository the file belongs to
	CommitID  string    // commit SHA the file was indexed at
	Content   string    // file content converted to UTF-8
	Language  string    // code language detected from the filename and content
	UpdatedAt time.Time // time the document was indexed, in UTC
}
   102  
// Type returns the document type, for bleve's mapping.Classifier interface.
// It always returns repoIndexerDocType, the name under which
// createBleveIndexer registers the document mapping.
func (d *RepoIndexerData) Type() string {
	return repoIndexerDocType
}
   107  
   108  const (
   109  	repoIndexerAnalyzer      = "repoIndexerAnalyzer"
   110  	repoIndexerDocType       = "repoIndexerDocType"
   111  	repoIndexerLatestVersion = 6
   112  )
   113  
   114  // createBleveIndexer create a bleve repo indexer if one does not already exist
   115  func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
   116  	docMapping := bleve.NewDocumentMapping()
   117  	numericFieldMapping := bleve.NewNumericFieldMapping()
   118  	numericFieldMapping.IncludeInAll = false
   119  	docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
   120  
   121  	textFieldMapping := bleve.NewTextFieldMapping()
   122  	textFieldMapping.IncludeInAll = false
   123  	docMapping.AddFieldMappingsAt("Content", textFieldMapping)
   124  
   125  	termFieldMapping := bleve.NewTextFieldMapping()
   126  	termFieldMapping.IncludeInAll = false
   127  	termFieldMapping.Analyzer = analyzer_keyword.Name
   128  	docMapping.AddFieldMappingsAt("Language", termFieldMapping)
   129  	docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
   130  
   131  	timeFieldMapping := bleve.NewDateTimeFieldMapping()
   132  	timeFieldMapping.IncludeInAll = false
   133  	docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
   134  
   135  	mapping := bleve.NewIndexMapping()
   136  	if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
   137  		return nil, err
   138  	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
   139  		"type":          analyzer_custom.Name,
   140  		"char_filters":  []string{},
   141  		"tokenizer":     unicode.Name,
   142  		"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
   143  	}); err != nil {
   144  		return nil, err
   145  	}
   146  	mapping.DefaultAnalyzer = repoIndexerAnalyzer
   147  	mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
   148  	mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
   149  
   150  	indexer, err := bleve.New(path, mapping)
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  
   155  	if err = rupture.WriteIndexMetadata(path, &rupture.IndexMetadata{
   156  		Version: latestVersion,
   157  	}); err != nil {
   158  		return nil, err
   159  	}
   160  	return indexer, nil
   161  }
   162  
   163  var _ Indexer = &BleveIndexer{}
   164  
   165  // BleveIndexer represents a bleve indexer implementation
   166  type BleveIndexer struct {
   167  	indexDir string
   168  	indexer  bleve.Index
   169  }
   170  
   171  // NewBleveIndexer creates a new bleve local indexer
   172  func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
   173  	indexer := &BleveIndexer{
   174  		indexDir: indexDir,
   175  	}
   176  	created, err := indexer.init()
   177  	if err != nil {
   178  		indexer.Close()
   179  		return nil, false, err
   180  	}
   181  	return indexer, created, err
   182  }
   183  
   184  func (b *BleveIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
   185  	update fileUpdate, repo *repo_model.Repository, batch *gitea_bleve.FlushingBatch,
   186  ) error {
   187  	// Ignore vendored files in code search
   188  	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
   189  		return nil
   190  	}
   191  
   192  	size := update.Size
   193  
   194  	var err error
   195  	if !update.Sized {
   196  		var stdout string
   197  		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
   198  		if err != nil {
   199  			return err
   200  		}
   201  		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
   202  			return fmt.Errorf("Misformatted git cat-file output: %w", err)
   203  		}
   204  	}
   205  
   206  	if size > setting.Indexer.MaxIndexerFileSize {
   207  		return b.addDelete(update.Filename, repo, batch)
   208  	}
   209  
   210  	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
   211  		return err
   212  	}
   213  
   214  	_, _, size, err = git.ReadBatchLine(batchReader)
   215  	if err != nil {
   216  		return err
   217  	}
   218  
   219  	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
   220  	if err != nil {
   221  		return err
   222  	} else if !typesniffer.DetectContentType(fileContents).IsText() {
   223  		// FIXME: UTF-16 files will probably fail here
   224  		return nil
   225  	}
   226  
   227  	if _, err = batchReader.Discard(1); err != nil {
   228  		return err
   229  	}
   230  	id := filenameIndexerID(repo.ID, update.Filename)
   231  	return batch.Index(id, &RepoIndexerData{
   232  		RepoID:    repo.ID,
   233  		CommitID:  commitSha,
   234  		Content:   string(charset.ToUTF8DropErrors(fileContents)),
   235  		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
   236  		UpdatedAt: time.Now().UTC(),
   237  	})
   238  }
   239  
   240  func (b *BleveIndexer) addDelete(filename string, repo *repo_model.Repository, batch *gitea_bleve.FlushingBatch) error {
   241  	id := filenameIndexerID(repo.ID, filename)
   242  	return batch.Delete(id)
   243  }
   244  
   245  // init init the indexer
   246  func (b *BleveIndexer) init() (bool, error) {
   247  	var err error
   248  	b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
   249  	if err != nil {
   250  		return false, err
   251  	}
   252  	if b.indexer != nil {
   253  		return false, nil
   254  	}
   255  
   256  	b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
   257  	if err != nil {
   258  		return false, err
   259  	}
   260  
   261  	return true, nil
   262  }
   263  
   264  // Close close the indexer
   265  func (b *BleveIndexer) Close() {
   266  	log.Debug("Closing repo indexer")
   267  	if b.indexer != nil {
   268  		err := b.indexer.Close()
   269  		if err != nil {
   270  			log.Error("Error whilst closing the repository indexer: %v", err)
   271  		}
   272  	}
   273  	log.Info("PID: %d Repository Indexer closed", os.Getpid())
   274  }
   275  
// SetAvailabilityChangeCallback does nothing: the callback is ignored and is
// never invoked by this implementation.
func (b *BleveIndexer) SetAvailabilityChangeCallback(callback func(bool)) {
}
   279  
// Ping always reports true; no check of the index is performed.
func (b *BleveIndexer) Ping() bool {
	return true
}
   284  
   285  // Index indexes the data
   286  func (b *BleveIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error {
   287  	batch := gitea_bleve.NewFlushingBatch(b.indexer, maxBatchSize)
   288  	if len(changes.Updates) > 0 {
   289  
   290  		// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
   291  		if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
   292  			log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
   293  			return err
   294  		}
   295  
   296  		batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
   297  		defer cancel()
   298  
   299  		for _, update := range changes.Updates {
   300  			if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil {
   301  				return err
   302  			}
   303  		}
   304  		cancel()
   305  	}
   306  	for _, filename := range changes.RemovedFilenames {
   307  		if err := b.addDelete(filename, repo, batch); err != nil {
   308  			return err
   309  		}
   310  	}
   311  	return batch.Flush()
   312  }
   313  
   314  // Delete deletes indexes by ids
   315  func (b *BleveIndexer) Delete(repoID int64) error {
   316  	query := numericEqualityQuery(repoID, "RepoID")
   317  	searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
   318  	result, err := b.indexer.Search(searchRequest)
   319  	if err != nil {
   320  		return err
   321  	}
   322  	batch := gitea_bleve.NewFlushingBatch(b.indexer, maxBatchSize)
   323  	for _, hit := range result.Hits {
   324  		if err = batch.Delete(hit.ID); err != nil {
   325  			return err
   326  		}
   327  	}
   328  	return batch.Flush()
   329  }
   330  
   331  // Search searches for files in the specified repo.
   332  // Returns the matching file-paths
   333  func (b *BleveIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
   334  	var (
   335  		indexerQuery query.Query
   336  		keywordQuery query.Query
   337  	)
   338  
   339  	if isMatch {
   340  		prefixQuery := bleve.NewPrefixQuery(keyword)
   341  		prefixQuery.FieldVal = "Content"
   342  		keywordQuery = prefixQuery
   343  	} else {
   344  		phraseQuery := bleve.NewMatchPhraseQuery(keyword)
   345  		phraseQuery.FieldVal = "Content"
   346  		phraseQuery.Analyzer = repoIndexerAnalyzer
   347  		keywordQuery = phraseQuery
   348  	}
   349  
   350  	if len(repoIDs) > 0 {
   351  		repoQueries := make([]query.Query, 0, len(repoIDs))
   352  		for _, repoID := range repoIDs {
   353  			repoQueries = append(repoQueries, numericEqualityQuery(repoID, "RepoID"))
   354  		}
   355  
   356  		indexerQuery = bleve.NewConjunctionQuery(
   357  			bleve.NewDisjunctionQuery(repoQueries...),
   358  			keywordQuery,
   359  		)
   360  	} else {
   361  		indexerQuery = keywordQuery
   362  	}
   363  
   364  	// Save for reuse without language filter
   365  	facetQuery := indexerQuery
   366  	if len(language) > 0 {
   367  		languageQuery := bleve.NewMatchQuery(language)
   368  		languageQuery.FieldVal = "Language"
   369  		languageQuery.Analyzer = analyzer_keyword.Name
   370  
   371  		indexerQuery = bleve.NewConjunctionQuery(
   372  			indexerQuery,
   373  			languageQuery,
   374  		)
   375  	}
   376  
   377  	from := (page - 1) * pageSize
   378  	searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
   379  	searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
   380  	searchRequest.IncludeLocations = true
   381  
   382  	if len(language) == 0 {
   383  		searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
   384  	}
   385  
   386  	result, err := b.indexer.SearchInContext(ctx, searchRequest)
   387  	if err != nil {
   388  		return 0, nil, nil, err
   389  	}
   390  
   391  	total := int64(result.Total)
   392  
   393  	searchResults := make([]*SearchResult, len(result.Hits))
   394  	for i, hit := range result.Hits {
   395  		startIndex, endIndex := -1, -1
   396  		for _, locations := range hit.Locations["Content"] {
   397  			location := locations[0]
   398  			locationStart := int(location.Start)
   399  			locationEnd := int(location.End)
   400  			if startIndex < 0 || locationStart < startIndex {
   401  				startIndex = locationStart
   402  			}
   403  			if endIndex < 0 || locationEnd > endIndex {
   404  				endIndex = locationEnd
   405  			}
   406  		}
   407  		language := hit.Fields["Language"].(string)
   408  		var updatedUnix timeutil.TimeStamp
   409  		if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
   410  			updatedUnix = timeutil.TimeStamp(t.Unix())
   411  		}
   412  		searchResults[i] = &SearchResult{
   413  			RepoID:      int64(hit.Fields["RepoID"].(float64)),
   414  			StartIndex:  startIndex,
   415  			EndIndex:    endIndex,
   416  			Filename:    filenameOfIndexerID(hit.ID),
   417  			Content:     hit.Fields["Content"].(string),
   418  			CommitID:    hit.Fields["CommitID"].(string),
   419  			UpdatedUnix: updatedUnix,
   420  			Language:    language,
   421  			Color:       enry.GetColor(language),
   422  		}
   423  	}
   424  
   425  	searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
   426  	if len(language) > 0 {
   427  		// Use separate query to go get all language counts
   428  		facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
   429  		facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
   430  		facetRequest.IncludeLocations = true
   431  		facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
   432  
   433  		if result, err = b.indexer.Search(facetRequest); err != nil {
   434  			return 0, nil, nil, err
   435  		}
   436  
   437  	}
   438  	languagesFacet := result.Facets["languages"]
   439  	for _, term := range languagesFacet.Terms.Terms() {
   440  		if len(term.Term) == 0 {
   441  			continue
   442  		}
   443  		searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
   444  			Language: term.Term,
   445  			Color:    enry.GetColor(term.Term),
   446  			Count:    term.Count,
   447  		})
   448  	}
   449  	return total, searchResults, searchResultLanguages, nil
   450  }