code.gitea.io/gitea@v1.19.3/modules/indexer/code/bleve.go (about) 1 // Copyright 2019 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package code 5 6 import ( 7 "bufio" 8 "context" 9 "fmt" 10 "io" 11 "os" 12 "strconv" 13 "strings" 14 "time" 15 16 repo_model "code.gitea.io/gitea/models/repo" 17 "code.gitea.io/gitea/modules/analyze" 18 "code.gitea.io/gitea/modules/charset" 19 "code.gitea.io/gitea/modules/git" 20 gitea_bleve "code.gitea.io/gitea/modules/indexer/bleve" 21 "code.gitea.io/gitea/modules/log" 22 "code.gitea.io/gitea/modules/setting" 23 "code.gitea.io/gitea/modules/timeutil" 24 "code.gitea.io/gitea/modules/typesniffer" 25 "code.gitea.io/gitea/modules/util" 26 27 "github.com/blevesearch/bleve/v2" 28 analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" 29 analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" 30 "github.com/blevesearch/bleve/v2/analysis/token/camelcase" 31 "github.com/blevesearch/bleve/v2/analysis/token/lowercase" 32 "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" 33 "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" 34 "github.com/blevesearch/bleve/v2/index/upsidedown" 35 "github.com/blevesearch/bleve/v2/mapping" 36 "github.com/blevesearch/bleve/v2/search/query" 37 "github.com/ethantkoenig/rupture" 38 "github.com/go-enry/go-enry/v2" 39 ) 40 41 const ( 42 unicodeNormalizeName = "unicodeNormalize" 43 maxBatchSize = 16 44 ) 45 46 // numericEqualityQuery a numeric equality query for the given value and field 47 func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery { 48 f := float64(value) 49 tru := true 50 q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru) 51 q.SetField(field) 52 return q 53 } 54 55 func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { 56 return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{ 57 "type": unicodenorm.Name, 58 "form": unicodenorm.NFC, 59 }) 60 } 61 62 // openBleveIndexer open the index at the specified path, checking for metadata 63 // updates and bleve version updates. If index needs to be created (or 64 // re-created), returns (nil, nil) 65 func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) { 66 _, err := os.Stat(path) 67 if err != nil && os.IsNotExist(err) { 68 return nil, nil 69 } else if err != nil { 70 return nil, err 71 } 72 73 metadata, err := rupture.ReadIndexMetadata(path) 74 if err != nil { 75 return nil, err 76 } 77 if metadata.Version < latestVersion { 78 // the indexer is using a previous version, so we should delete it and 79 // re-populate 80 return nil, util.RemoveAll(path) 81 } 82 83 index, err := bleve.Open(path) 84 if err != nil && err == upsidedown.IncompatibleVersion { 85 // the indexer was built with a previous version of bleve, so we should 86 // delete it and re-populate 87 return nil, util.RemoveAll(path) 88 } else if err != nil { 89 return nil, err 90 } 91 return index, nil 92 } 93 94 // RepoIndexerData data stored in the repo indexer 95 type RepoIndexerData struct { 96 RepoID int64 97 CommitID string 98 Content string 99 Language string 100 UpdatedAt time.Time 101 } 102 103 // Type returns the document type, for bleve's mapping.Classifier interface. 104 func (d *RepoIndexerData) Type() string { 105 return repoIndexerDocType 106 } 107 108 const ( 109 repoIndexerAnalyzer = "repoIndexerAnalyzer" 110 repoIndexerDocType = "repoIndexerDocType" 111 repoIndexerLatestVersion = 6 112 ) 113 114 // createBleveIndexer create a bleve repo indexer if one does not already exist 115 func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) { 116 docMapping := bleve.NewDocumentMapping() 117 numericFieldMapping := bleve.NewNumericFieldMapping() 118 numericFieldMapping.IncludeInAll = false 119 docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping) 120 121 textFieldMapping := bleve.NewTextFieldMapping() 122 textFieldMapping.IncludeInAll = false 123 docMapping.AddFieldMappingsAt("Content", textFieldMapping) 124 125 termFieldMapping := bleve.NewTextFieldMapping() 126 termFieldMapping.IncludeInAll = false 127 termFieldMapping.Analyzer = analyzer_keyword.Name 128 docMapping.AddFieldMappingsAt("Language", termFieldMapping) 129 docMapping.AddFieldMappingsAt("CommitID", termFieldMapping) 130 131 timeFieldMapping := bleve.NewDateTimeFieldMapping() 132 timeFieldMapping.IncludeInAll = false 133 docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) 134 135 mapping := bleve.NewIndexMapping() 136 if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { 137 return nil, err 138 } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{ 139 "type": analyzer_custom.Name, 140 "char_filters": []string{}, 141 "tokenizer": unicode.Name, 142 "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, 143 }); err != nil { 144 return nil, err 145 } 146 mapping.DefaultAnalyzer = repoIndexerAnalyzer 147 mapping.AddDocumentMapping(repoIndexerDocType, docMapping) 148 mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) 149 150 indexer, err := bleve.New(path, mapping) 151 if err != nil { 152 return nil, err 153 } 154 155 if err = rupture.WriteIndexMetadata(path, &rupture.IndexMetadata{ 156 Version: latestVersion, 157 }); err != nil { 158 return nil, err 159 } 160 return indexer, nil 161 } 162 163 var _ Indexer = &BleveIndexer{} 164 165 // BleveIndexer represents a bleve indexer implementation 166 type BleveIndexer struct { 167 indexDir string 168 indexer bleve.Index 169 } 170 171 // NewBleveIndexer creates a new bleve local indexer 172 func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) { 173 indexer := &BleveIndexer{ 174 indexDir: indexDir, 175 } 176 created, err := indexer.init() 177 if err != nil { 178 indexer.Close() 179 return nil, false, err 180 } 181 return indexer, created, err 182 } 183 184 func (b *BleveIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string, 185 update fileUpdate, repo *repo_model.Repository, batch *gitea_bleve.FlushingBatch, 186 ) error { 187 // Ignore vendored files in code search 188 if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { 189 return nil 190 } 191 192 size := update.Size 193 194 var err error 195 if !update.Sized { 196 var stdout string 197 stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) 198 if err != nil { 199 return err 200 } 201 if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil { 202 return fmt.Errorf("Misformatted git cat-file output: %w", err) 203 } 204 } 205 206 if size > setting.Indexer.MaxIndexerFileSize { 207 return b.addDelete(update.Filename, repo, batch) 208 } 209 210 if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { 211 return err 212 } 213 214 _, _, size, err = git.ReadBatchLine(batchReader) 215 if err != nil { 216 return err 217 } 218 219 fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) 220 if err != nil { 221 return err 222 } else if !typesniffer.DetectContentType(fileContents).IsText() { 223 // FIXME: UTF-16 files will probably fail here 224 return nil 225 } 226 227 if _, err = batchReader.Discard(1); err != nil { 228 return err 229 } 230 id := filenameIndexerID(repo.ID, update.Filename) 231 return batch.Index(id, &RepoIndexerData{ 232 RepoID: repo.ID, 233 CommitID: commitSha, 234 Content: string(charset.ToUTF8DropErrors(fileContents)), 235 Language: analyze.GetCodeLanguage(update.Filename, fileContents), 236 UpdatedAt: time.Now().UTC(), 237 }) 238 } 239 240 func (b *BleveIndexer) addDelete(filename string, repo *repo_model.Repository, batch *gitea_bleve.FlushingBatch) error { 241 id := filenameIndexerID(repo.ID, filename) 242 return batch.Delete(id) 243 } 244 245 // init init the indexer 246 func (b *BleveIndexer) init() (bool, error) { 247 var err error 248 b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion) 249 if err != nil { 250 return false, err 251 } 252 if b.indexer != nil { 253 return false, nil 254 } 255 256 b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion) 257 if err != nil { 258 return false, err 259 } 260 261 return true, nil 262 } 263 264 // Close close the indexer 265 func (b *BleveIndexer) Close() { 266 log.Debug("Closing repo indexer") 267 if b.indexer != nil { 268 err := b.indexer.Close() 269 if err != nil { 270 log.Error("Error whilst closing the repository indexer: %v", err) 271 } 272 } 273 log.Info("PID: %d Repository Indexer closed", os.Getpid()) 274 } 275 276 // SetAvailabilityChangeCallback does nothing 277 func (b *BleveIndexer) SetAvailabilityChangeCallback(callback func(bool)) { 278 } 279 280 // Ping does nothing 281 func (b *BleveIndexer) Ping() bool { 282 return true 283 } 284 285 // Index indexes the data 286 func (b *BleveIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error { 287 batch := gitea_bleve.NewFlushingBatch(b.indexer, maxBatchSize) 288 if len(changes.Updates) > 0 { 289 290 // Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first! 291 if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil { 292 log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err) 293 return err 294 } 295 296 batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath()) 297 defer cancel() 298 299 for _, update := range changes.Updates { 300 if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil { 301 return err 302 } 303 } 304 cancel() 305 } 306 for _, filename := range changes.RemovedFilenames { 307 if err := b.addDelete(filename, repo, batch); err != nil { 308 return err 309 } 310 } 311 return batch.Flush() 312 } 313 314 // Delete deletes indexes by ids 315 func (b *BleveIndexer) Delete(repoID int64) error { 316 query := numericEqualityQuery(repoID, "RepoID") 317 searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false) 318 result, err := b.indexer.Search(searchRequest) 319 if err != nil { 320 return err 321 } 322 batch := gitea_bleve.NewFlushingBatch(b.indexer, maxBatchSize) 323 for _, hit := range result.Hits { 324 if err = batch.Delete(hit.ID); err != nil { 325 return err 326 } 327 } 328 return batch.Flush() 329 } 330 331 // Search searches for files in the specified repo. 332 // Returns the matching file-paths 333 func (b *BleveIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) { 334 var ( 335 indexerQuery query.Query 336 keywordQuery query.Query 337 ) 338 339 if isMatch { 340 prefixQuery := bleve.NewPrefixQuery(keyword) 341 prefixQuery.FieldVal = "Content" 342 keywordQuery = prefixQuery 343 } else { 344 phraseQuery := bleve.NewMatchPhraseQuery(keyword) 345 phraseQuery.FieldVal = "Content" 346 phraseQuery.Analyzer = repoIndexerAnalyzer 347 keywordQuery = phraseQuery 348 } 349 350 if len(repoIDs) > 0 { 351 repoQueries := make([]query.Query, 0, len(repoIDs)) 352 for _, repoID := range repoIDs { 353 repoQueries = append(repoQueries, numericEqualityQuery(repoID, "RepoID")) 354 } 355 356 indexerQuery = bleve.NewConjunctionQuery( 357 bleve.NewDisjunctionQuery(repoQueries...), 358 keywordQuery, 359 ) 360 } else { 361 indexerQuery = keywordQuery 362 } 363 364 // Save for reuse without language filter 365 facetQuery := indexerQuery 366 if len(language) > 0 { 367 languageQuery := bleve.NewMatchQuery(language) 368 languageQuery.FieldVal = "Language" 369 languageQuery.Analyzer = analyzer_keyword.Name 370 371 indexerQuery = bleve.NewConjunctionQuery( 372 indexerQuery, 373 languageQuery, 374 ) 375 } 376 377 from := (page - 1) * pageSize 378 searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) 379 searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} 380 searchRequest.IncludeLocations = true 381 382 if len(language) == 0 { 383 searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) 384 } 385 386 result, err := b.indexer.SearchInContext(ctx, searchRequest) 387 if err != nil { 388 return 0, nil, nil, err 389 } 390 391 total := int64(result.Total) 392 393 searchResults := make([]*SearchResult, len(result.Hits)) 394 for i, hit := range result.Hits { 395 startIndex, endIndex := -1, -1 396 for _, locations := range hit.Locations["Content"] { 397 location := locations[0] 398 locationStart := int(location.Start) 399 locationEnd := int(location.End) 400 if startIndex < 0 || locationStart < startIndex { 401 startIndex = locationStart 402 } 403 if endIndex < 0 || locationEnd > endIndex { 404 endIndex = locationEnd 405 } 406 } 407 language := hit.Fields["Language"].(string) 408 var updatedUnix timeutil.TimeStamp 409 if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { 410 updatedUnix = timeutil.TimeStamp(t.Unix()) 411 } 412 searchResults[i] = &SearchResult{ 413 RepoID: int64(hit.Fields["RepoID"].(float64)), 414 StartIndex: startIndex, 415 EndIndex: endIndex, 416 Filename: filenameOfIndexerID(hit.ID), 417 Content: hit.Fields["Content"].(string), 418 CommitID: hit.Fields["CommitID"].(string), 419 UpdatedUnix: updatedUnix, 420 Language: language, 421 Color: enry.GetColor(language), 422 } 423 } 424 425 searchResultLanguages := make([]*SearchResultLanguages, 0, 10) 426 if len(language) > 0 { 427 // Use separate query to go get all language counts 428 facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false) 429 facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} 430 facetRequest.IncludeLocations = true 431 facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) 432 433 if result, err = b.indexer.Search(facetRequest); err != nil { 434 return 0, nil, nil, err 435 } 436 437 } 438 languagesFacet := result.Facets["languages"] 439 for _, term := range languagesFacet.Terms.Terms() { 440 if len(term.Term) == 0 { 441 continue 442 } 443 searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{ 444 Language: term.Term, 445 Color: enry.GetColor(term.Term), 446 Count: term.Count, 447 }) 448 } 449 return total, searchResults, searchResultLanguages, nil 450 }