code.gitea.io/gitea@v1.22.3/modules/indexer/code/elasticsearch/elasticsearch.go (about) 1 // Copyright 2020 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package elasticsearch 5 6 import ( 7 "bufio" 8 "context" 9 "fmt" 10 "io" 11 "strconv" 12 "strings" 13 14 repo_model "code.gitea.io/gitea/models/repo" 15 "code.gitea.io/gitea/modules/analyze" 16 "code.gitea.io/gitea/modules/charset" 17 "code.gitea.io/gitea/modules/git" 18 "code.gitea.io/gitea/modules/gitrepo" 19 "code.gitea.io/gitea/modules/indexer/code/internal" 20 indexer_internal "code.gitea.io/gitea/modules/indexer/internal" 21 inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch" 22 "code.gitea.io/gitea/modules/json" 23 "code.gitea.io/gitea/modules/log" 24 "code.gitea.io/gitea/modules/setting" 25 "code.gitea.io/gitea/modules/timeutil" 26 "code.gitea.io/gitea/modules/typesniffer" 27 28 "github.com/go-enry/go-enry/v2" 29 "github.com/olivere/elastic/v7" 30 ) 31 32 const ( 33 esRepoIndexerLatestVersion = 1 34 // multi-match-types, currently only 2 types are used 35 // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types 36 esMultiMatchTypeBestFields = "best_fields" 37 esMultiMatchTypePhrasePrefix = "phrase_prefix" 38 ) 39 40 var _ internal.Indexer = &Indexer{} 41 42 // Indexer implements Indexer interface 43 type Indexer struct { 44 inner *inner_elasticsearch.Indexer 45 indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much 46 } 47 48 // NewIndexer creates a new elasticsearch indexer 49 func NewIndexer(url, indexerName string) *Indexer { 50 inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping) 51 indexer := &Indexer{ 52 inner: inner, 53 Indexer: inner, 54 } 55 return indexer 56 } 57 58 const ( 59 defaultMapping = `{ 60 "mappings": { 61 "properties": { 62 "repo_id": { 63 "type": "long", 64 "index": true 65 }, 66 "content": { 67 "type": "text", 68 "term_vector": "with_positions_offsets", 69 "index": true 70 }, 71 "commit_id": { 72 "type": "keyword", 73 "index": true 74 }, 75 "language": { 76 "type": "keyword", 77 "index": true 78 }, 79 "updated_at": { 80 "type": "long", 81 "index": true 82 } 83 } 84 } 85 }` 86 ) 87 88 func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) { 89 // Ignore vendored files in code search 90 if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { 91 return nil, nil 92 } 93 94 size := update.Size 95 var err error 96 if !update.Sized { 97 var stdout string 98 stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) 99 if err != nil { 100 return nil, err 101 } 102 if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil { 103 return nil, fmt.Errorf("misformatted git cat-file output: %w", err) 104 } 105 } 106 107 if size > setting.Indexer.MaxIndexerFileSize { 108 return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil 109 } 110 111 if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { 112 return nil, err 113 } 114 115 _, _, size, err = git.ReadBatchLine(batchReader) 116 if err != nil { 117 return nil, err 118 } 119 120 fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) 121 if err != nil { 122 return nil, err 123 } else if !typesniffer.DetectContentType(fileContents).IsText() { 124 // FIXME: UTF-16 files will probably fail here 125 return nil, nil 126 } 127 128 if _, err = batchReader.Discard(1); err != nil { 129 return nil, err 130 } 131 id := internal.FilenameIndexerID(repo.ID, update.Filename) 132 133 return []elastic.BulkableRequest{ 134 elastic.NewBulkIndexRequest(). 135 Index(b.inner.VersionedIndexName()). 136 Id(id). 137 Doc(map[string]any{ 138 "repo_id": repo.ID, 139 "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), 140 "commit_id": sha, 141 "language": analyze.GetCodeLanguage(update.Filename, fileContents), 142 "updated_at": timeutil.TimeStampNow(), 143 }), 144 }, nil 145 } 146 147 func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest { 148 id := internal.FilenameIndexerID(repo.ID, filename) 149 return elastic.NewBulkDeleteRequest(). 150 Index(b.inner.VersionedIndexName()). 151 Id(id) 152 } 153 154 // Index will save the index data 155 func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error { 156 reqs := make([]elastic.BulkableRequest, 0) 157 if len(changes.Updates) > 0 { 158 r, err := gitrepo.OpenRepository(ctx, repo) 159 if err != nil { 160 return err 161 } 162 defer r.Close() 163 batch, err := r.NewBatch(ctx) 164 if err != nil { 165 return err 166 } 167 defer batch.Close() 168 169 for _, update := range changes.Updates { 170 updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo) 171 if err != nil { 172 return err 173 } 174 if len(updateReqs) > 0 { 175 reqs = append(reqs, updateReqs...) 176 } 177 } 178 batch.Close() 179 } 180 181 for _, filename := range changes.RemovedFilenames { 182 reqs = append(reqs, b.addDelete(filename, repo)) 183 } 184 185 if len(reqs) > 0 { 186 esBatchSize := 50 187 188 for i := 0; i < len(reqs); i += esBatchSize { 189 _, err := b.inner.Client.Bulk(). 190 Index(b.inner.VersionedIndexName()). 191 Add(reqs[i:min(i+esBatchSize, len(reqs))]...). 192 Do(ctx) 193 if err != nil { 194 return err 195 } 196 } 197 } 198 return nil 199 } 200 201 // Delete entries by repoId 202 func (b *Indexer) Delete(ctx context.Context, repoID int64) error { 203 if err := b.doDelete(ctx, repoID); err != nil { 204 // Maybe there is a conflict during the delete operation, so we should retry after a refresh 205 log.Warn("Deletion of entries of repo %v within index %v was erroneus. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err) 206 if err := b.refreshIndex(ctx); err != nil { 207 return err 208 } 209 if err := b.doDelete(ctx, repoID); err != nil { 210 log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName()) 211 return err 212 } 213 } 214 return nil 215 } 216 217 func (b *Indexer) refreshIndex(ctx context.Context) error { 218 if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil { 219 log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err) 220 return err 221 } 222 223 return nil 224 } 225 226 // Delete entries by repoId 227 func (b *Indexer) doDelete(ctx context.Context, repoID int64) error { 228 _, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()). 229 Query(elastic.NewTermsQuery("repo_id", repoID)). 230 Do(ctx) 231 return err 232 } 233 234 // indexPos find words positions for start and the following end on content. It will 235 // return the beginning position of the first start and the ending position of the 236 // first end following the start string. 237 // If not found any of the positions, it will return -1, -1. 238 func indexPos(content, start, end string) (int, int) { 239 startIdx := strings.Index(content, start) 240 if startIdx < 0 { 241 return -1, -1 242 } 243 endIdx := strings.Index(content[startIdx+len(start):], end) 244 if endIdx < 0 { 245 return -1, -1 246 } 247 return startIdx, startIdx + len(start) + endIdx + len(end) 248 } 249 250 func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { 251 hits := make([]*internal.SearchResult, 0, pageSize) 252 for _, hit := range searchResult.Hits.Hits { 253 // FIXME: There is no way to get the position the keyword on the content currently on the same request. 254 // So we get it from content, this may made the query slower. See 255 // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291 256 var startIndex, endIndex int 257 c, ok := hit.Highlight["content"] 258 if ok && len(c) > 0 { 259 // FIXME: Since the highlighting content will include <em> and </em> for the keywords, 260 // now we should find the positions. But how to avoid html content which contains the 261 // <em> and </em> tags? If elastic search has handled that? 262 startIndex, endIndex = indexPos(c[0], "<em>", "</em>") 263 if startIndex == -1 { 264 panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0])) 265 } 266 } else { 267 panic(fmt.Sprintf("2===%#v", hit.Highlight)) 268 } 269 270 repoID, fileName := internal.ParseIndexerID(hit.Id) 271 res := make(map[string]any) 272 if err := json.Unmarshal(hit.Source, &res); err != nil { 273 return 0, nil, nil, err 274 } 275 276 language := res["language"].(string) 277 278 hits = append(hits, &internal.SearchResult{ 279 RepoID: repoID, 280 Filename: fileName, 281 CommitID: res["commit_id"].(string), 282 Content: res["content"].(string), 283 UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), 284 Language: language, 285 StartIndex: startIndex, 286 EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data 287 Color: enry.GetColor(language), 288 }) 289 } 290 291 return searchResult.TotalHits(), hits, extractAggs(searchResult), nil 292 } 293 294 func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages { 295 var searchResultLanguages []*internal.SearchResultLanguages 296 agg, found := searchResult.Aggregations.Terms("language") 297 if found { 298 searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10) 299 300 for _, bucket := range agg.Buckets { 301 searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{ 302 Language: bucket.Key.(string), 303 Color: enry.GetColor(bucket.Key.(string)), 304 Count: int(bucket.DocCount), 305 }) 306 } 307 } 308 return searchResultLanguages 309 } 310 311 // Search searches for codes and language stats by given conditions. 312 func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { 313 searchType := esMultiMatchTypePhrasePrefix 314 if opts.IsKeywordFuzzy { 315 searchType = esMultiMatchTypeBestFields 316 } 317 318 kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType) 319 query := elastic.NewBoolQuery() 320 query = query.Must(kwQuery) 321 if len(opts.RepoIDs) > 0 { 322 repoStrs := make([]any, 0, len(opts.RepoIDs)) 323 for _, repoID := range opts.RepoIDs { 324 repoStrs = append(repoStrs, repoID) 325 } 326 repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...) 327 query = query.Must(repoQuery) 328 } 329 330 var ( 331 start, pageSize = opts.GetSkipTake() 332 kw = "<em>" + opts.Keyword + "</em>" 333 aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc() 334 ) 335 336 if len(opts.Language) == 0 { 337 searchResult, err := b.inner.Client.Search(). 338 Index(b.inner.VersionedIndexName()). 339 Aggregation("language", aggregation). 340 Query(query). 341 Highlight( 342 elastic.NewHighlight(). 343 Field("content"). 344 NumOfFragments(0). // return all highting content on fragments 345 HighlighterType("fvh"), 346 ). 347 Sort("repo_id", true). 348 From(start).Size(pageSize). 349 Do(ctx) 350 if err != nil { 351 return 0, nil, nil, err 352 } 353 354 return convertResult(searchResult, kw, pageSize) 355 } 356 357 langQuery := elastic.NewMatchQuery("language", opts.Language) 358 countResult, err := b.inner.Client.Search(). 359 Index(b.inner.VersionedIndexName()). 360 Aggregation("language", aggregation). 361 Query(query). 362 Size(0). // We only need stats information 363 Do(ctx) 364 if err != nil { 365 return 0, nil, nil, err 366 } 367 368 query = query.Must(langQuery) 369 searchResult, err := b.inner.Client.Search(). 370 Index(b.inner.VersionedIndexName()). 371 Query(query). 372 Highlight( 373 elastic.NewHighlight(). 374 Field("content"). 375 NumOfFragments(0). // return all highting content on fragments 376 HighlighterType("fvh"), 377 ). 378 Sort("repo_id", true). 379 From(start).Size(pageSize). 380 Do(ctx) 381 if err != nil { 382 return 0, nil, nil, err 383 } 384 385 total, hits, _, err := convertResult(searchResult, kw, pageSize) 386 387 return total, hits, extractAggs(countResult), err 388 }