code.gitea.io/gitea@v1.22.3/modules/indexer/code/bleve/bleve.go (about) 1 // Copyright 2019 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package bleve 5 6 import ( 7 "bufio" 8 "context" 9 "fmt" 10 "io" 11 "strconv" 12 "strings" 13 "time" 14 15 repo_model "code.gitea.io/gitea/models/repo" 16 "code.gitea.io/gitea/modules/analyze" 17 "code.gitea.io/gitea/modules/charset" 18 "code.gitea.io/gitea/modules/git" 19 "code.gitea.io/gitea/modules/gitrepo" 20 "code.gitea.io/gitea/modules/indexer/code/internal" 21 indexer_internal "code.gitea.io/gitea/modules/indexer/internal" 22 inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" 23 "code.gitea.io/gitea/modules/setting" 24 "code.gitea.io/gitea/modules/timeutil" 25 "code.gitea.io/gitea/modules/typesniffer" 26 27 "github.com/blevesearch/bleve/v2" 28 analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" 29 analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" 30 "github.com/blevesearch/bleve/v2/analysis/token/camelcase" 31 "github.com/blevesearch/bleve/v2/analysis/token/lowercase" 32 "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" 33 "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" 34 "github.com/blevesearch/bleve/v2/mapping" 35 "github.com/blevesearch/bleve/v2/search/query" 36 "github.com/go-enry/go-enry/v2" 37 ) 38 39 const ( 40 unicodeNormalizeName = "unicodeNormalize" 41 maxBatchSize = 16 42 ) 43 44 func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { 45 return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{ 46 "type": unicodenorm.Name, 47 "form": unicodenorm.NFC, 48 }) 49 } 50 51 // RepoIndexerData data stored in the repo indexer 52 type RepoIndexerData struct { 53 RepoID int64 54 CommitID string 55 Content string 56 Language string 57 UpdatedAt time.Time 58 } 59 60 // Type returns the document type, for bleve's mapping.Classifier interface. 61 func (d *RepoIndexerData) Type() string { 62 return repoIndexerDocType 63 } 64 65 const ( 66 repoIndexerAnalyzer = "repoIndexerAnalyzer" 67 repoIndexerDocType = "repoIndexerDocType" 68 repoIndexerLatestVersion = 6 69 ) 70 71 // generateBleveIndexMapping generates a bleve index mapping for the repo indexer 72 func generateBleveIndexMapping() (mapping.IndexMapping, error) { 73 docMapping := bleve.NewDocumentMapping() 74 numericFieldMapping := bleve.NewNumericFieldMapping() 75 numericFieldMapping.IncludeInAll = false 76 docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping) 77 78 textFieldMapping := bleve.NewTextFieldMapping() 79 textFieldMapping.IncludeInAll = false 80 docMapping.AddFieldMappingsAt("Content", textFieldMapping) 81 82 termFieldMapping := bleve.NewTextFieldMapping() 83 termFieldMapping.IncludeInAll = false 84 termFieldMapping.Analyzer = analyzer_keyword.Name 85 docMapping.AddFieldMappingsAt("Language", termFieldMapping) 86 docMapping.AddFieldMappingsAt("CommitID", termFieldMapping) 87 88 timeFieldMapping := bleve.NewDateTimeFieldMapping() 89 timeFieldMapping.IncludeInAll = false 90 docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) 91 92 mapping := bleve.NewIndexMapping() 93 if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { 94 return nil, err 95 } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ 96 "type": analyzer_custom.Name, 97 "char_filters": []string{}, 98 "tokenizer": unicode.Name, 99 "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, 100 }); err != nil { 101 return nil, err 102 } 103 mapping.DefaultAnalyzer = repoIndexerAnalyzer 104 mapping.AddDocumentMapping(repoIndexerDocType, docMapping) 105 mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) 106 107 return mapping, nil 108 } 109 110 var _ internal.Indexer = &Indexer{} 111 112 // Indexer represents a bleve indexer implementation 113 type Indexer struct { 114 inner *inner_bleve.Indexer 115 indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much 116 } 117 118 // NewIndexer creates a new bleve local indexer 119 func NewIndexer(indexDir string) *Indexer { 120 inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping) 121 return &Indexer{ 122 Indexer: inner, 123 inner: inner, 124 } 125 } 126 127 func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string, 128 update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch, 129 ) error { 130 // Ignore vendored files in code search 131 if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { 132 return nil 133 } 134 135 size := update.Size 136 137 var err error 138 if !update.Sized { 139 var stdout string 140 stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) 141 if err != nil { 142 return err 143 } 144 if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil { 145 return fmt.Errorf("misformatted git cat-file output: %w", err) 146 } 147 } 148 149 if size > setting.Indexer.MaxIndexerFileSize { 150 return b.addDelete(update.Filename, repo, batch) 151 } 152 153 if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { 154 return err 155 } 156 157 _, _, size, err = git.ReadBatchLine(batchReader) 158 if err != nil { 159 return err 160 } 161 162 fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) 163 if err != nil { 164 return err 165 } else if !typesniffer.DetectContentType(fileContents).IsText() { 166 // FIXME: UTF-16 files will probably fail here 167 return nil 168 } 169 170 if _, err = batchReader.Discard(1); err != nil { 171 return err 172 } 173 id := internal.FilenameIndexerID(repo.ID, update.Filename) 174 return batch.Index(id, &RepoIndexerData{ 175 RepoID: repo.ID, 176 CommitID: commitSha, 177 Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), 178 Language: analyze.GetCodeLanguage(update.Filename, fileContents), 179 UpdatedAt: time.Now().UTC(), 180 }) 181 } 182 183 func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error { 184 id := internal.FilenameIndexerID(repo.ID, filename) 185 return batch.Delete(id) 186 } 187 188 // Index indexes the data 189 func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error { 190 batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) 191 if len(changes.Updates) > 0 { 192 r, err := gitrepo.OpenRepository(ctx, repo) 193 if err != nil { 194 return err 195 } 196 defer r.Close() 197 gitBatch, err := r.NewBatch(ctx) 198 if err != nil { 199 return err 200 } 201 defer gitBatch.Close() 202 203 for _, update := range changes.Updates { 204 if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil { 205 return err 206 } 207 } 208 gitBatch.Close() 209 } 210 for _, filename := range changes.RemovedFilenames { 211 if err := b.addDelete(filename, repo, batch); err != nil { 212 return err 213 } 214 } 215 return batch.Flush() 216 } 217 218 // Delete deletes indexes by ids 219 func (b *Indexer) Delete(_ context.Context, repoID int64) error { 220 query := inner_bleve.NumericEqualityQuery(repoID, "RepoID") 221 searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false) 222 result, err := b.inner.Indexer.Search(searchRequest) 223 if err != nil { 224 return err 225 } 226 batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) 227 for _, hit := range result.Hits { 228 if err = batch.Delete(hit.ID); err != nil { 229 return err 230 } 231 } 232 return batch.Flush() 233 } 234 235 // Search searches for files in the specified repo. 236 // Returns the matching file-paths 237 func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { 238 var ( 239 indexerQuery query.Query 240 keywordQuery query.Query 241 ) 242 243 phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) 244 phraseQuery.FieldVal = "Content" 245 phraseQuery.Analyzer = repoIndexerAnalyzer 246 keywordQuery = phraseQuery 247 if opts.IsKeywordFuzzy { 248 phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) 249 } 250 251 if len(opts.RepoIDs) > 0 { 252 repoQueries := make([]query.Query, 0, len(opts.RepoIDs)) 253 for _, repoID := range opts.RepoIDs { 254 repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID")) 255 } 256 257 indexerQuery = bleve.NewConjunctionQuery( 258 bleve.NewDisjunctionQuery(repoQueries...), 259 keywordQuery, 260 ) 261 } else { 262 indexerQuery = keywordQuery 263 } 264 265 // Save for reuse without language filter 266 facetQuery := indexerQuery 267 if len(opts.Language) > 0 { 268 languageQuery := bleve.NewMatchQuery(opts.Language) 269 languageQuery.FieldVal = "Language" 270 languageQuery.Analyzer = analyzer_keyword.Name 271 272 indexerQuery = bleve.NewConjunctionQuery( 273 indexerQuery, 274 languageQuery, 275 ) 276 } 277 278 from, pageSize := opts.GetSkipTake() 279 searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) 280 searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} 281 searchRequest.IncludeLocations = true 282 283 if len(opts.Language) == 0 { 284 searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) 285 } 286 287 result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest) 288 if err != nil { 289 return 0, nil, nil, err 290 } 291 292 total := int64(result.Total) 293 294 searchResults := make([]*internal.SearchResult, len(result.Hits)) 295 for i, hit := range result.Hits { 296 startIndex, endIndex := -1, -1 297 for _, locations := range hit.Locations["Content"] { 298 location := locations[0] 299 locationStart := int(location.Start) 300 locationEnd := int(location.End) 301 if startIndex < 0 || locationStart < startIndex { 302 startIndex = locationStart 303 } 304 if endIndex < 0 || locationEnd > endIndex { 305 endIndex = locationEnd 306 } 307 } 308 language := hit.Fields["Language"].(string) 309 var updatedUnix timeutil.TimeStamp 310 if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { 311 updatedUnix = timeutil.TimeStamp(t.Unix()) 312 } 313 searchResults[i] = &internal.SearchResult{ 314 RepoID: int64(hit.Fields["RepoID"].(float64)), 315 StartIndex: startIndex, 316 EndIndex: endIndex, 317 Filename: internal.FilenameOfIndexerID(hit.ID), 318 Content: hit.Fields["Content"].(string), 319 CommitID: hit.Fields["CommitID"].(string), 320 UpdatedUnix: updatedUnix, 321 Language: language, 322 Color: enry.GetColor(language), 323 } 324 } 325 326 searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10) 327 if len(opts.Language) > 0 { 328 // Use separate query to go get all language counts 329 facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false) 330 facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} 331 facetRequest.IncludeLocations = true 332 facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) 333 334 if result, err = b.inner.Indexer.Search(facetRequest); err != nil { 335 return 0, nil, nil, err 336 } 337 } 338 languagesFacet := result.Facets["languages"] 339 for _, term := range languagesFacet.Terms.Terms() { 340 if len(term.Term) == 0 { 341 continue 342 } 343 searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{ 344 Language: term.Term, 345 Color: enry.GetColor(term.Term), 346 Count: term.Count, 347 }) 348 } 349 return total, searchResults, searchResultLanguages, nil 350 }