code.gitea.io/gitea@v1.22.3/modules/indexer/code/bleve/bleve.go (about)

     1  // Copyright 2019 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package bleve
     5  
     6  import (
     7  	"bufio"
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"strconv"
    12  	"strings"
    13  	"time"
    14  
    15  	repo_model "code.gitea.io/gitea/models/repo"
    16  	"code.gitea.io/gitea/modules/analyze"
    17  	"code.gitea.io/gitea/modules/charset"
    18  	"code.gitea.io/gitea/modules/git"
    19  	"code.gitea.io/gitea/modules/gitrepo"
    20  	"code.gitea.io/gitea/modules/indexer/code/internal"
    21  	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
    22  	inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
    23  	"code.gitea.io/gitea/modules/setting"
    24  	"code.gitea.io/gitea/modules/timeutil"
    25  	"code.gitea.io/gitea/modules/typesniffer"
    26  
    27  	"github.com/blevesearch/bleve/v2"
    28  	analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
    29  	analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
    30  	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
    31  	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
    32  	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
    33  	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
    34  	"github.com/blevesearch/bleve/v2/mapping"
    35  	"github.com/blevesearch/bleve/v2/search/query"
    36  	"github.com/go-enry/go-enry/v2"
    37  )
    38  
    39  const (
    40  	unicodeNormalizeName = "unicodeNormalize"
    41  	maxBatchSize         = 16
    42  )
    43  
    44  func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
    45  	return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
    46  		"type": unicodenorm.Name,
    47  		"form": unicodenorm.NFC,
    48  	})
    49  }
    50  
    51  // RepoIndexerData data stored in the repo indexer
    52  type RepoIndexerData struct {
    53  	RepoID    int64
    54  	CommitID  string
    55  	Content   string
    56  	Language  string
    57  	UpdatedAt time.Time
    58  }
    59  
    60  // Type returns the document type, for bleve's mapping.Classifier interface.
    61  func (d *RepoIndexerData) Type() string {
    62  	return repoIndexerDocType
    63  }
    64  
    65  const (
    66  	repoIndexerAnalyzer      = "repoIndexerAnalyzer"
    67  	repoIndexerDocType       = "repoIndexerDocType"
    68  	repoIndexerLatestVersion = 6
    69  )
    70  
    71  // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
    72  func generateBleveIndexMapping() (mapping.IndexMapping, error) {
    73  	docMapping := bleve.NewDocumentMapping()
    74  	numericFieldMapping := bleve.NewNumericFieldMapping()
    75  	numericFieldMapping.IncludeInAll = false
    76  	docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
    77  
    78  	textFieldMapping := bleve.NewTextFieldMapping()
    79  	textFieldMapping.IncludeInAll = false
    80  	docMapping.AddFieldMappingsAt("Content", textFieldMapping)
    81  
    82  	termFieldMapping := bleve.NewTextFieldMapping()
    83  	termFieldMapping.IncludeInAll = false
    84  	termFieldMapping.Analyzer = analyzer_keyword.Name
    85  	docMapping.AddFieldMappingsAt("Language", termFieldMapping)
    86  	docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
    87  
    88  	timeFieldMapping := bleve.NewDateTimeFieldMapping()
    89  	timeFieldMapping.IncludeInAll = false
    90  	docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
    91  
    92  	mapping := bleve.NewIndexMapping()
    93  	if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
    94  		return nil, err
    95  	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
    96  		"type":          analyzer_custom.Name,
    97  		"char_filters":  []string{},
    98  		"tokenizer":     unicode.Name,
    99  		"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
   100  	}); err != nil {
   101  		return nil, err
   102  	}
   103  	mapping.DefaultAnalyzer = repoIndexerAnalyzer
   104  	mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
   105  	mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
   106  
   107  	return mapping, nil
   108  }
   109  
   110  var _ internal.Indexer = &Indexer{}
   111  
   112  // Indexer represents a bleve indexer implementation
   113  type Indexer struct {
   114  	inner                    *inner_bleve.Indexer
   115  	indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
   116  }
   117  
   118  // NewIndexer creates a new bleve local indexer
   119  func NewIndexer(indexDir string) *Indexer {
   120  	inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
   121  	return &Indexer{
   122  		Indexer: inner,
   123  		inner:   inner,
   124  	}
   125  }
   126  
   127  func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
   128  	update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
   129  ) error {
   130  	// Ignore vendored files in code search
   131  	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
   132  		return nil
   133  	}
   134  
   135  	size := update.Size
   136  
   137  	var err error
   138  	if !update.Sized {
   139  		var stdout string
   140  		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
   141  		if err != nil {
   142  			return err
   143  		}
   144  		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
   145  			return fmt.Errorf("misformatted git cat-file output: %w", err)
   146  		}
   147  	}
   148  
   149  	if size > setting.Indexer.MaxIndexerFileSize {
   150  		return b.addDelete(update.Filename, repo, batch)
   151  	}
   152  
   153  	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
   154  		return err
   155  	}
   156  
   157  	_, _, size, err = git.ReadBatchLine(batchReader)
   158  	if err != nil {
   159  		return err
   160  	}
   161  
   162  	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
   163  	if err != nil {
   164  		return err
   165  	} else if !typesniffer.DetectContentType(fileContents).IsText() {
   166  		// FIXME: UTF-16 files will probably fail here
   167  		return nil
   168  	}
   169  
   170  	if _, err = batchReader.Discard(1); err != nil {
   171  		return err
   172  	}
   173  	id := internal.FilenameIndexerID(repo.ID, update.Filename)
   174  	return batch.Index(id, &RepoIndexerData{
   175  		RepoID:    repo.ID,
   176  		CommitID:  commitSha,
   177  		Content:   string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
   178  		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
   179  		UpdatedAt: time.Now().UTC(),
   180  	})
   181  }
   182  
   183  func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
   184  	id := internal.FilenameIndexerID(repo.ID, filename)
   185  	return batch.Delete(id)
   186  }
   187  
   188  // Index indexes the data
   189  func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
   190  	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
   191  	if len(changes.Updates) > 0 {
   192  		r, err := gitrepo.OpenRepository(ctx, repo)
   193  		if err != nil {
   194  			return err
   195  		}
   196  		defer r.Close()
   197  		gitBatch, err := r.NewBatch(ctx)
   198  		if err != nil {
   199  			return err
   200  		}
   201  		defer gitBatch.Close()
   202  
   203  		for _, update := range changes.Updates {
   204  			if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil {
   205  				return err
   206  			}
   207  		}
   208  		gitBatch.Close()
   209  	}
   210  	for _, filename := range changes.RemovedFilenames {
   211  		if err := b.addDelete(filename, repo, batch); err != nil {
   212  			return err
   213  		}
   214  	}
   215  	return batch.Flush()
   216  }
   217  
   218  // Delete deletes indexes by ids
   219  func (b *Indexer) Delete(_ context.Context, repoID int64) error {
   220  	query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
   221  	searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
   222  	result, err := b.inner.Indexer.Search(searchRequest)
   223  	if err != nil {
   224  		return err
   225  	}
   226  	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
   227  	for _, hit := range result.Hits {
   228  		if err = batch.Delete(hit.ID); err != nil {
   229  			return err
   230  		}
   231  	}
   232  	return batch.Flush()
   233  }
   234  
   235  // Search searches for files in the specified repo.
   236  // Returns the matching file-paths
   237  func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
   238  	var (
   239  		indexerQuery query.Query
   240  		keywordQuery query.Query
   241  	)
   242  
   243  	phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
   244  	phraseQuery.FieldVal = "Content"
   245  	phraseQuery.Analyzer = repoIndexerAnalyzer
   246  	keywordQuery = phraseQuery
   247  	if opts.IsKeywordFuzzy {
   248  		phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
   249  	}
   250  
   251  	if len(opts.RepoIDs) > 0 {
   252  		repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
   253  		for _, repoID := range opts.RepoIDs {
   254  			repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
   255  		}
   256  
   257  		indexerQuery = bleve.NewConjunctionQuery(
   258  			bleve.NewDisjunctionQuery(repoQueries...),
   259  			keywordQuery,
   260  		)
   261  	} else {
   262  		indexerQuery = keywordQuery
   263  	}
   264  
   265  	// Save for reuse without language filter
   266  	facetQuery := indexerQuery
   267  	if len(opts.Language) > 0 {
   268  		languageQuery := bleve.NewMatchQuery(opts.Language)
   269  		languageQuery.FieldVal = "Language"
   270  		languageQuery.Analyzer = analyzer_keyword.Name
   271  
   272  		indexerQuery = bleve.NewConjunctionQuery(
   273  			indexerQuery,
   274  			languageQuery,
   275  		)
   276  	}
   277  
   278  	from, pageSize := opts.GetSkipTake()
   279  	searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
   280  	searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
   281  	searchRequest.IncludeLocations = true
   282  
   283  	if len(opts.Language) == 0 {
   284  		searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
   285  	}
   286  
   287  	result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
   288  	if err != nil {
   289  		return 0, nil, nil, err
   290  	}
   291  
   292  	total := int64(result.Total)
   293  
   294  	searchResults := make([]*internal.SearchResult, len(result.Hits))
   295  	for i, hit := range result.Hits {
   296  		startIndex, endIndex := -1, -1
   297  		for _, locations := range hit.Locations["Content"] {
   298  			location := locations[0]
   299  			locationStart := int(location.Start)
   300  			locationEnd := int(location.End)
   301  			if startIndex < 0 || locationStart < startIndex {
   302  				startIndex = locationStart
   303  			}
   304  			if endIndex < 0 || locationEnd > endIndex {
   305  				endIndex = locationEnd
   306  			}
   307  		}
   308  		language := hit.Fields["Language"].(string)
   309  		var updatedUnix timeutil.TimeStamp
   310  		if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
   311  			updatedUnix = timeutil.TimeStamp(t.Unix())
   312  		}
   313  		searchResults[i] = &internal.SearchResult{
   314  			RepoID:      int64(hit.Fields["RepoID"].(float64)),
   315  			StartIndex:  startIndex,
   316  			EndIndex:    endIndex,
   317  			Filename:    internal.FilenameOfIndexerID(hit.ID),
   318  			Content:     hit.Fields["Content"].(string),
   319  			CommitID:    hit.Fields["CommitID"].(string),
   320  			UpdatedUnix: updatedUnix,
   321  			Language:    language,
   322  			Color:       enry.GetColor(language),
   323  		}
   324  	}
   325  
   326  	searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
   327  	if len(opts.Language) > 0 {
   328  		// Use separate query to go get all language counts
   329  		facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
   330  		facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
   331  		facetRequest.IncludeLocations = true
   332  		facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
   333  
   334  		if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
   335  			return 0, nil, nil, err
   336  		}
   337  	}
   338  	languagesFacet := result.Facets["languages"]
   339  	for _, term := range languagesFacet.Terms.Terms() {
   340  		if len(term.Term) == 0 {
   341  			continue
   342  		}
   343  		searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
   344  			Language: term.Term,
   345  			Color:    enry.GetColor(term.Term),
   346  			Count:    term.Count,
   347  		})
   348  	}
   349  	return total, searchResults, searchResultLanguages, nil
   350  }