code.gitea.io/gitea@v1.19.3/modules/indexer/code/elastic_search.go

// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package code

import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"net"
	"strconv"
	"strings"
	"sync"
	"time"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	"code.gitea.io/gitea/modules/graceful"
	"code.gitea.io/gitea/modules/json"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"

	"github.com/go-enry/go-enry/v2"
	"github.com/olivere/elastic/v7"
)

const (
	esRepoIndexerLatestVersion = 1
	// multi-match-types, currently only 2 types are used
	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)

var _ Indexer = &ElasticSearchIndexer{}

// ElasticSearchIndexer implements Indexer interface
type ElasticSearchIndexer struct {
	client               *elastic.Client
	indexerAliasName     string
	available            bool
	availabilityCallback func(bool)
	stopTimer            chan struct{}
	lock                 sync.RWMutex
}

type elasticLogger struct {
	log.Logger
}

func (l elasticLogger) Printf(format string, args ...interface{}) {
	_ = l.Logger.Log(2, l.Logger.GetLevel(), format, args...)
}

// NewElasticSearchIndexer creates a new elasticsearch indexer
func NewElasticSearchIndexer(url, indexerName string) (*ElasticSearchIndexer, bool, error) {
	opts := []elastic.ClientOptionFunc{
		elastic.SetURL(url),
		elastic.SetSniff(false),
		elastic.SetHealthcheckInterval(10 * time.Second),
		elastic.SetGzip(false),
	}

	logger := elasticLogger{log.GetLogger(log.DEFAULT)}

	if logger.GetLevel() == log.TRACE || logger.GetLevel() == log.DEBUG {
		opts = append(opts, elastic.SetTraceLog(logger))
	} else if logger.GetLevel() == log.ERROR || logger.GetLevel() == log.CRITICAL || logger.GetLevel() == log.FATAL {
		opts = append(opts, elastic.SetErrorLog(logger))
	} else if logger.GetLevel() == log.INFO || logger.GetLevel() == log.WARN {
		opts = append(opts, elastic.SetInfoLog(logger))
	}

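	// Sniffing is disabled above, so the client only talks to the configured URL;
	// elastic.NewClient also runs an initial health check against it and returns an
	// error here if Elasticsearch is unreachable at startup.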
	client, err := elastic.NewClient(opts...)
	if err != nil {
		return nil, false, err
	}

	indexer := &ElasticSearchIndexer{
		client:           client,
		indexerAliasName: indexerName,
		available:        true,
		stopTimer:        make(chan struct{}),
	}

	ticker := time.NewTicker(10 * time.Second)
	go func() {
		for {
			select {
			case <-ticker.C:
				indexer.checkAvailability()
			case <-indexer.stopTimer:
				ticker.Stop()
				return
			}
		}
	}()

	exists, err := indexer.init()
	if err != nil {
		indexer.Close()
		return nil, false, err
	}
	return indexer, !exists, err
}

const (
	defaultMapping = `{
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"content": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
)

func (b *ElasticSearchIndexer) realIndexerName() string {
	return fmt.Sprintf("%s.v%d", b.indexerAliasName, esRepoIndexerLatestVersion)
}

// init will initialize the indexer
func (b *ElasticSearchIndexer) init() (bool, error) {
	ctx := graceful.GetManager().HammerContext()
	exists, err := b.client.IndexExists(b.realIndexerName()).Do(ctx)
	if err != nil {
		return false, b.checkError(err)
	}
	if !exists {
		mapping := defaultMapping

		createIndex, err := b.client.CreateIndex(b.realIndexerName()).BodyString(mapping).Do(ctx)
		if err != nil {
			return false, b.checkError(err)
		}
		if !createIndex.Acknowledged {
			return false, fmt.Errorf("create index %s with %s failed", b.realIndexerName(), mapping)
		}
	}

	// check version
	r, err := b.client.Aliases().Do(ctx)
	if err != nil {
		return false, b.checkError(err)
	}

	realIndexerNames := r.IndicesByAlias(b.indexerAliasName)
	if len(realIndexerNames) < 1 {
		res, err := b.client.Alias().
			Add(b.realIndexerName(), b.indexerAliasName).
			Do(ctx)
		if err != nil {
			return false, b.checkError(err)
		}
		if !res.Acknowledged {
			return false, fmt.Errorf("create alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
		}
	} else if len(realIndexerNames) >= 1 && realIndexerNames[0] < b.realIndexerName() {
		log.Warn("Found older gitea indexer named %s, but we will create a new one %s and keep the old NOT DELETED. You can delete the old version after the upgrade succeeds.",
			realIndexerNames[0], b.realIndexerName())
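		// Elasticsearch applies the Remove and Add actions below in a single update-aliases
		// request, so the alias is switched from the old index to the new one atomically.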
		res, err := b.client.Alias().
			Remove(realIndexerNames[0], b.indexerAliasName).
			Add(b.realIndexerName(), b.indexerAliasName).
			Do(ctx)
		if err != nil {
			return false, b.checkError(err)
		}
		if !res.Acknowledged {
			return false, fmt.Errorf("change alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
		}
	}

	return exists, nil
}

// SetAvailabilityChangeCallback sets the callback that will be triggered when availability changes
func (b *ElasticSearchIndexer) SetAvailabilityChangeCallback(callback func(bool)) {
	b.lock.Lock()
	defer b.lock.Unlock()
	b.availabilityCallback = callback
}

// Ping checks if elastic is available
func (b *ElasticSearchIndexer) Ping() bool {
	b.lock.RLock()
	defer b.lock.RUnlock()
	return b.available
}

func (b *ElasticSearchIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update fileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}

	size := update.Size
	var err error
	if !update.Sized {
		var stdout string
		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	if size > setting.Indexer.MaxIndexerFileSize {
		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
	}

	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return nil, err
	}

	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return nil, err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return nil, err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		return nil, nil
	}

	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}
	id := filenameIndexerID(repo.ID, update.Filename)

	return []elastic.BulkableRequest{
		elastic.NewBulkIndexRequest().
			Index(b.indexerAliasName).
			Id(id).
			Doc(map[string]interface{}{
				"repo_id":    repo.ID,
				"content":    string(charset.ToUTF8DropErrors(fileContents)),
				"commit_id":  sha,
				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
				"updated_at": timeutil.TimeStampNow(),
			}),
	}, nil
}

func (b *ElasticSearchIndexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
	id := filenameIndexerID(repo.ID, filename)
	return elastic.NewBulkDeleteRequest().
		Index(b.indexerAliasName).
		Id(id)
}

// Index will save the index data
func (b *ElasticSearchIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error {
	reqs := make([]elastic.BulkableRequest, 0)
	if len(changes.Updates) > 0 {
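		// All updated blobs are streamed through one long-running `git cat-file --batch`
		// process (batchWriter/batchReader below) instead of spawning a git process per file.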
		// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory, we need to run git rev-parse first!
		if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
			log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
			return err
		}

		batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
		defer cancel()

		for _, update := range changes.Updates {
			updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo)
			if err != nil {
				return err
			}
			if len(updateReqs) > 0 {
				reqs = append(reqs, updateReqs...)
			}
		}
		cancel()
	}

	for _, filename := range changes.RemovedFilenames {
		reqs = append(reqs, b.addDelete(filename, repo))
	}

	if len(reqs) > 0 {
		_, err := b.client.Bulk().
			Index(b.indexerAliasName).
			Add(reqs...).
			Do(ctx)
		return b.checkError(err)
	}
	return nil
}

// Delete deletes indexes by repoID
func (b *ElasticSearchIndexer) Delete(repoID int64) error {
	_, err := b.client.DeleteByQuery(b.indexerAliasName).
		Query(elastic.NewTermsQuery("repo_id", repoID)).
		Do(graceful.GetManager().HammerContext())
	return b.checkError(err)
}

// indexPos finds the positions of the start string and the end string that follows it in content.
// It returns the beginning position of the first start and the ending position of the first end
// found after that start. If either string is not found, it returns -1, -1.
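// For example, indexPos("abc<em>def</em>ghi", "<em>", "</em>") returns (3, 15):
// 3 is the offset where "<em>" begins and 15 is the offset just past the closing "</em>".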
func indexPos(content, start, end string) (int, int) {
	startIdx := strings.Index(content, start)
	if startIdx < 0 {
		return -1, -1
	}
	endIdx := strings.Index(content[startIdx+len(start):], end)
	if endIdx < 0 {
		return -1, -1
	}
	return startIdx, startIdx + len(start) + endIdx + len(end)
}

func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
	hits := make([]*SearchResult, 0, pageSize)
	for _, hit := range searchResult.Hits.Hits {
		// FIXME: There is currently no way to get the position of the keyword in the content in the
		// same request, so we recover it from the highlighted content, which may make the query slower. See
		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
		var startIndex, endIndex int
		c, ok := hit.Highlight["content"]
		if ok && len(c) > 0 {
			// FIXME: Since the highlighted content wraps the keywords in <em> and </em>, we look for
			// those markers to find the positions. But how do we avoid HTML content that already
			// contains <em> and </em> tags? Does Elasticsearch handle that?
			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
			if startIndex == -1 {
				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
			}
		} else {
			panic(fmt.Sprintf("2===%#v", hit.Highlight))
		}

		repoID, fileName := parseIndexerID(hit.Id)
		res := make(map[string]interface{})
		if err := json.Unmarshal(hit.Source, &res); err != nil {
			return 0, nil, nil, err
		}

		language := res["language"].(string)

		hits = append(hits, &SearchResult{
			RepoID:      repoID,
			Filename:    fileName,
			CommitID:    res["commit_id"].(string),
			Content:     res["content"].(string),
			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
			Language:    language,
			StartIndex:  startIndex,
			EndIndex:    endIndex - 9, // subtract len("<em></em>") == 9 since Content holds the original, unhighlighted data
			Color:       enry.GetColor(language),
		})
	}

	return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
}

func extractAggs(searchResult *elastic.SearchResult) []*SearchResultLanguages {
	var searchResultLanguages []*SearchResultLanguages
	agg, found := searchResult.Aggregations.Terms("language")
	if found {
		searchResultLanguages = make([]*SearchResultLanguages, 0, 10)

		for _, bucket := range agg.Buckets {
			searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
				Language: bucket.Key.(string),
				Color:    enry.GetColor(bucket.Key.(string)),
				Count:    int(bucket.DocCount),
			})
		}
	}
	return searchResultLanguages
}

// Search searches for code and language stats by the given conditions.
func (b *ElasticSearchIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
	searchType := esMultiMatchTypeBestFields
	if isMatch {
		searchType = esMultiMatchTypePhrasePrefix
	}

	kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
	query := elastic.NewBoolQuery()
	query = query.Must(kwQuery)
	if len(repoIDs) > 0 {
		repoStrs := make([]interface{}, 0, len(repoIDs))
		for _, repoID := range repoIDs {
			repoStrs = append(repoStrs, repoID)
		}
		repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
		query = query.Must(repoQuery)
	}

	var (
		start       int
		kw          = "<em>" + keyword + "</em>"
		aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
	)

	if page > 0 {
		start = (page - 1) * pageSize
	}

	if len(language) == 0 {
		searchResult, err := b.client.Search().
			Index(b.indexerAliasName).
			Aggregation("language", aggregation).
			Query(query).
			Highlight(
				elastic.NewHighlight().
					Field("content").
					NumOfFragments(0). // return all highlighted content in fragments
					HighlighterType("fvh"),
			).
			Sort("repo_id", true).
			From(start).Size(pageSize).
			Do(ctx)
		if err != nil {
			return 0, nil, nil, b.checkError(err)
		}

		return convertResult(searchResult, kw, pageSize)
	}

	langQuery := elastic.NewMatchQuery("language", language)
	countResult, err := b.client.Search().
		Index(b.indexerAliasName).
		Aggregation("language", aggregation).
		Query(query).
		Size(0). // We only need stats information
		Do(ctx)
	if err != nil {
		return 0, nil, nil, b.checkError(err)
	}

	query = query.Must(langQuery)
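	// The language-filtered query below returns the actual page of hits, while countResult from
	// the query above (without the language filter) supplies the language aggregation so the
	// per-language stats still cover all languages.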
	searchResult, err := b.client.Search().
		Index(b.indexerAliasName).
		Query(query).
		Highlight(
			elastic.NewHighlight().
				Field("content").
				NumOfFragments(0). // return all highlighted content in fragments
				HighlighterType("fvh"),
		).
		Sort("repo_id", true).
		From(start).Size(pageSize).
		Do(ctx)
	if err != nil {
		return 0, nil, nil, b.checkError(err)
	}

	total, hits, _, err := convertResult(searchResult, kw, pageSize)

	return total, hits, extractAggs(countResult), err
}

// Close implements indexer
func (b *ElasticSearchIndexer) Close() {
	select {
	case <-b.stopTimer:
	default:
		close(b.stopTimer)
	}
}

func (b *ElasticSearchIndexer) checkError(err error) error {
	var opErr *net.OpError
	if !(elastic.IsConnErr(err) || (errors.As(err, &opErr) && (opErr.Op == "dial" || opErr.Op == "read"))) {
		return err
	}

	b.setAvailability(false)

	return err
}

func (b *ElasticSearchIndexer) checkAvailability() {
	if b.Ping() {
		return
	}

	// Request cluster state to check if elastic is available again
	_, err := b.client.ClusterState().Do(graceful.GetManager().ShutdownContext())
	if err != nil {
		b.setAvailability(false)
		return
	}

	b.setAvailability(true)
}

func (b *ElasticSearchIndexer) setAvailability(available bool) {
	b.lock.Lock()
	defer b.lock.Unlock()

	if b.available == available {
		return
	}

	b.available = available
	if b.availabilityCallback != nil {
		// Call the callback from within the lock to ensure that the ordering remains correct
		b.availabilityCallback(b.available)
	}
}
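
// Illustrative usage sketch (the URL, index name, and surrounding wiring are placeholder
// assumptions; in Gitea this type is normally constructed by the code indexer setup):
//
//	indexer, needsPopulate, err := NewElasticSearchIndexer("http://localhost:9200", "gitea_codes")
//	if err != nil {
//		// handle connection / initialization error
//	}
//	defer indexer.Close()
//	// needsPopulate is true when a fresh index was just created and must be filled by the caller.
//	total, hits, langs, err := indexer.Search(ctx, nil, "", "keyword", 1, 50, false)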