// eintopf.info@v0.13.16/service/search/search.go

// Copyright (C) 2022 The Eintopf authors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

// Package search provides a full text search service backed by a bleve index.
package search

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"eintopf.info/internal/cache"
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/lang/de"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
)

// Indexable defines a document that can be indexed.
// Any model that implements this interface can be indexed and is therefore
// searchable.
// An indexable document is uniquely identified by the combination of its
// identifier and its type.
type Indexable interface {
	// Identifier returns an id which should uniquely identify the object for
	// its type.
	Identifier() string

	// Type returns the type of the object.
	Type() string

	// QueryText returns the string to index for a text search.
	QueryText() string

	// SearchFields returns a map of additional fields to be indexed. Those
	// fields can be used for filtering or aggregations.
	SearchFields() map[string]interface{}
}

// Service defines a search service.
58 // 59 // -go:generate go run github.com/petergtz/pegomock/pegomock generate eintopf.info/service/search Service --output=../../internal/mock/search_service.go --package=mock --mock-name=SearchService 60 type Service interface { 61 // Index takes one or many indexable document and adds them to the search index. 62 Index(docs ...Indexable) error 63 64 // Delete deletes the document with the given type and id. 65 // Note: both type and id have to be provided in order to uniquely identify 66 // the document. 67 Delete(docType, id string) error 68 69 // Search performes a full text search on all indexed documents. 70 // Performes a wildcard match for empty queries. 71 // In addition to the search query a set of additional search options can be 72 // provided. 73 Search(ctx context.Context, opts *Options) (*Result, error) 74 75 // LastModified returns the last time, the store was updated. This can be 76 // used to invalidate a client side cache. 77 LastModified() time.Time 78 79 // Stops the search service. 80 Stop() 81 } 82 83 // Options defines a set of optional search options. 84 type Options struct { 85 // Query is the search query used for a text search. 86 Query string `json:"query"` 87 88 // Sort is the field, that should be sorted by. 89 // When left empty, the default sorting is used. 90 Sort string `json:"sort"` 91 92 // SortDescending defines the sort order. 93 SortDescending bool `json:"sortAscending"` 94 95 // Page is current page. 96 Page int `json:"page"` 97 98 // PageSize defines the number of hits returned per page. 99 // 100 // PageSize is infinite when set to 0. 101 PageSize int `json:"pageSize"` 102 103 // Filters is a list of filters, that reduce the search result. All filters 104 // are combined with AND logic in addition with the search query. 105 Filters []Filter `json:"filter"` 106 107 // Aggregations is a map of aggregations, to perform aggregations on fields. 
108 // The provided map key can be used to identify the corresponding bucket in 109 // the result. 110 Aggregations map[string]Aggregation `json:"aggregations"` 111 } 112 113 // CacheKey returns a string uniquely identifying the Options object. 114 func (o *Options) CacheKey() string { 115 key := o.Query + o.Sort + strconv.FormatBool(o.SortDescending) + strconv.Itoa(o.Page) + strconv.Itoa(o.Page) 116 for _, f := range o.Filters { 117 key += f.CacheKey() 118 } 119 for k, a := range o.Aggregations { 120 key += k + a.CacheKey() 121 } 122 return key 123 } 124 125 // Result contains a search result. 126 type Result struct { 127 // Hits are the search hits for the current pagination. 128 Hits []Hit `json:"hits"` 129 130 // Total is the total number of search hits. 131 // It is independet of the current pagination. 132 Total uint64 `json:"total"` 133 134 // Buckets is a set of aggregation buckets. 135 // The map key corresponds to aggregation name. 136 Buckets map[string]Bucket `json:"buckets"` 137 } 138 139 // Hit is a single search hit. 140 type Hit struct { 141 // ID is the unique identifier of the stored entity. It might not match the 142 // id of the entity in case it was indexed multiple times. 143 ID string `json:"id"` 144 // Type is the type of the document. 145 Type string `json:"type"` 146 // Raw contains the raw data of the document in the form of a json string. 147 Raw string `json:"raw"` 148 } 149 150 // Unmarshal unmarshals the raw data into v using json.Unmarshal. 151 func (h *Hit) Unmarshal(v interface{}) error { 152 return json.Unmarshal([]byte(h.Raw), v) 153 } 154 155 // NewService returns a new search service. 156 // Takes the path to the index directory. If the index already exits, the 157 // existing index is used. Otherwise a new one will be created. 
158 func NewService(indexPath string, searchTimeout time.Duration, resultCacheSize int, bucketCacheSize int, tz *time.Location) (Service, error) { 159 var index bleve.Index 160 161 // Check if an index already exists at the specified index path. Use the 162 // existing index, if it exists. 163 if _, err := os.Stat(indexPath); !os.IsNotExist(err) { 164 index, err = bleve.Open(indexPath) 165 if err != nil { 166 return nil, err 167 } 168 } else { 169 // Create a simple index mapping, containing a single document mapping. 170 mapping := bleve.NewIndexMapping() 171 mapping.DefaultType = "doc" 172 mapping.AddDocumentMapping("doc", documentMapping()) 173 174 index, err = bleve.New(indexPath, mapping) 175 if err != nil { 176 return nil, err 177 } 178 } 179 180 return &service{ 181 tz: tz, 182 lastStoreUpdate: time.Now(), 183 index: index, 184 searchTimeout: searchTimeout, 185 cache: cache.NewFavoritesCache[*Result](resultCacheSize), 186 bucketCache: cache.NewFavoritesCache[Bucket](bucketCacheSize), 187 }, nil 188 } 189 190 // service is an implementation of the Service interface using bleve search. It 191 // is internal to hide implementation details. 192 type service struct { 193 tz *time.Location 194 195 // lastUpdate stores the time of the last store update. 196 lastStoreUpdate time.Time 197 mLastStoreUpdate sync.Mutex 198 199 // index is the bleve index. The index is kept simple, by only storing one 200 // data structure (document). 201 index bleve.Index 202 203 // searchTimeout is the maximum duration a search request may take. 204 // When the request takes longer, it gets canceled. 205 searchTimeout time.Duration 206 207 // cache stores search result. 208 // 209 // The cache gets cleared, when an index or delete operation is performed 210 cache *cache.Favorites[*Result] 211 // bucketCache stores buckets resulting from a unique aggregation. The cache 212 // key is retrieved from the CacheKey method on the aggregation. 
213 // 214 // The cache gets cleared, when an index or delete operation is performed. 215 bucketCache *cache.Favorites[Bucket] 216 } 217 218 // document is the internal data structure, that gets indexed into bleve. 219 // This document provides a structure for the index data, while beeing generic 220 // enough to allow multiple data types, text quering and filters and 221 // aggregations on specified fields. 222 type document struct { 223 // Type is the external type of the document. 224 Type string `json:"type"` 225 // Raw stores the actual document in a json marshaled form. 226 Raw string `json:"raw"` 227 // Query is a special field allowing text queries. 228 Query string `json:"query"` 229 // Fields holds a set of fields, that can be used for filtering or 230 // aggregations. 231 Fields map[string]interface{} `json:"fields"` 232 } 233 234 // documentMapping returns the bleve document mapping for the document data 235 // structure. 236 func documentMapping() *mapping.DocumentMapping { 237 queryMapping := bleve.NewTextFieldMapping() 238 queryMapping.Analyzer = de.AnalyzerName 239 240 m := bleve.NewDocumentMapping() 241 m.AddFieldMappingsAt("type", bleve.NewKeywordFieldMapping()) 242 m.AddFieldMappingsAt("query", queryMapping) 243 m.AddFieldMappingsAt("raw", &mapping.FieldMapping{ 244 Type: "text", 245 Store: true, 246 Index: false, 247 IncludeTermVectors: false, 248 IncludeInAll: false, 249 DocValues: false, 250 }) 251 m.AddSubDocumentMapping("fields", bleve.NewDocumentMapping()) 252 253 return m 254 } 255 256 func fieldKey(field string) string { 257 if field == "type" { 258 return field 259 } 260 return "fields." + field 261 } 262 263 func uniqueID(docType string, id string) string { 264 return docType + "_" + id 265 } 266 267 // Index takes an indexable document and converts it into the internal document 268 // data structure, which then gets indexed into the bleve index. 269 // 270 // Returns an error if the document cannot be marshaled into a json string. 
271 // Returns an error if the bleve index operation failed. 272 func (s *service) Index(docs ...Indexable) error { 273 batch := s.index.NewBatch() 274 for _, doc := range docs { 275 raw, err := json.Marshal(doc) 276 if err != nil { 277 return err 278 } 279 batch.Index(uniqueID(doc.Type(), doc.Identifier()), document{ 280 Type: doc.Type(), 281 Query: doc.QueryText(), 282 Raw: string(raw), 283 Fields: doc.SearchFields(), 284 }) 285 } 286 err := s.index.Batch(batch) 287 if err != nil { 288 return err 289 } 290 291 s.cache.Clear() 292 s.bucketCache.Clear() 293 s.updateLastStoreUpdate() 294 295 return nil 296 } 297 298 // Delete removes the document from the bleve index. 299 func (s *service) Delete(docType string, id string) error { 300 err := s.index.Delete(uniqueID(docType, id)) 301 if err != nil { 302 return err 303 } 304 305 s.cache.Clear() 306 s.bucketCache.Clear() 307 s.updateLastStoreUpdate() 308 309 return nil 310 } 311 312 func (s *service) updateLastStoreUpdate() { 313 s.mLastStoreUpdate.Lock() 314 s.lastStoreUpdate = time.Now() 315 s.mLastStoreUpdate.Unlock() 316 } 317 318 // Search perfomes a search request on the bleve index. 319 func (s *service) Search(ctx context.Context, opts *Options) (*Result, error) { 320 ctx, cancel := context.WithTimeout(ctx, s.searchTimeout) 321 defer cancel() 322 323 if opts == nil { 324 opts = &Options{} 325 } 326 if result, ok := s.cache.Get(opts.CacheKey()); ok { 327 return result, nil 328 } 329 330 sRequest := buildSearchRequest(opts.Query, opts.Filters) 331 if opts.PageSize > 0 { 332 // If the page size is larger than 0, paginate the result according to 333 // the page and page size. 334 sRequest.From = opts.Page * opts.PageSize 335 sRequest.Size = opts.PageSize 336 } else { 337 sRequest.Size = 100000 338 } 339 340 // Only retrieve the "raw" and "type" fields. 
341 sRequest.Fields = []string{"raw", "type"} 342 343 if opts.Sort != "" { 344 sort := fieldKey(opts.Sort) 345 if opts.SortDescending { 346 sort = fmt.Sprintf("-%s", sort) 347 } 348 sRequest.SortBy([]string{sort}) 349 } 350 351 type searchResult struct { 352 err error 353 hits []Hit 354 total uint64 355 } 356 sChan := make(chan searchResult) 357 go func() { 358 result, err := s.index.SearchInContext(ctx, sRequest) 359 if err != nil { 360 sChan <- searchResult{err: err} 361 return 362 } 363 hits := make([]Hit, 0, len(result.Hits)) 364 for _, hit := range result.Hits { 365 typ := hit.Fields["type"].(string) 366 hits = append(hits, Hit{ 367 ID: strings.TrimPrefix(hit.ID, fmt.Sprintf("%s_", typ)), 368 Type: typ, 369 Raw: hit.Fields["raw"].(string), 370 }) 371 } 372 sChan <- searchResult{hits: hits, total: result.Total} 373 }() 374 375 // Perform a seperate search request per aggregation. This enables 376 // independet aggregations. 377 buckets := make(map[string]Bucket, len(opts.Aggregations)) 378 bucketsM := sync.Mutex{} 379 bucketsWG := sync.WaitGroup{} 380 381 for name, aggregation := range opts.Aggregations { 382 bucketsWG.Add(1) 383 go func(name string, aggregation Aggregation) { 384 defer bucketsWG.Done() 385 386 // Check if the bucket resulting in this aggregation is cached. 387 cacheKey := opts.Query + aggregation.CacheKey() 388 if bucket, ok := s.bucketCache.Get(cacheKey); ok && bucket != nil { 389 bucketsM.Lock() 390 defer bucketsM.Unlock() 391 392 buckets[name] = bucket 393 return 394 } 395 bucket, err := s.aggregate(ctx, opts.Query, aggregation) 396 if err != nil { 397 // Log the error instead of returning it. This makes sure the search 398 // doesn't fail, if a search request has an invalid aggregation. 
399 log.Printf("aggregate: %s: %s\n", name, err) 400 } 401 s.bucketCache.Set(cacheKey, bucket) 402 403 bucketsM.Lock() 404 defer bucketsM.Unlock() 405 buckets[name] = bucket 406 }(name, aggregation) 407 } 408 409 sResult := <-sChan 410 if sResult.err != nil { 411 return nil, sResult.err 412 } 413 bucketsWG.Wait() 414 415 result := &Result{Hits: sResult.hits, Total: sResult.total, Buckets: buckets} 416 417 s.cache.Set(opts.CacheKey(), result) 418 419 return result, nil 420 } 421 422 func (s *service) aggregate(ctx context.Context, queryString string, aggregation Aggregation) (Bucket, error) { 423 search := buildSearchRequest(queryString, aggregation.Filters) 424 search.Fields = []string{fieldKey(aggregation.Field)} 425 search.Size = 10000 426 result, err := s.index.SearchInContext(ctx, search) 427 if err != nil { 428 return nil, fmt.Errorf("aggregate: %s", err) 429 } 430 431 var aggregator aggregator 432 switch aggregation.Type { 433 case TermsAggregation: 434 aggregator = &termsAggregator{terms: make(map[string]int)} 435 case DateRangeAggregation: 436 aggregator = &dateRangeAggregator{ 437 min: time.Date(9999, 0, 0, 0, 0, 0, 0, s.tz), 438 max: time.Unix(0, 0), 439 } 440 default: 441 return nil, fmt.Errorf("invalid aggregation type: %s", aggregation.Type) 442 } 443 for _, hit := range result.Hits { 444 field, ok := hit.Fields[fieldKey(aggregation.Field)] 445 if !ok { 446 continue 447 } 448 err := aggregator.aggregate(field) 449 if err != nil { 450 return nil, fmt.Errorf("aggregagte(%s): %s", field, err) 451 } 452 } 453 return aggregator.bucket(), nil 454 } 455 456 // buildSearchRequest builds a bleve.SearchRequest from a query string and a set 457 // of filters. 458 // All filters and the query string query are combined with AND logic. 459 // For the query string a match query gets created. If the query string is 460 // empty, a wildcard query gets created. 
461 func buildSearchRequest(queryString string, filters []Filter) *bleve.SearchRequest { 462 query := buildQueryStringQuery(queryString) 463 464 if len(filters) > 0 { 465 boolQuery := bleve.NewBooleanQuery() 466 boolQuery.AddMust(query) 467 for _, filter := range filters { 468 if f := filter.filterQuery(); f != nil { 469 boolQuery.AddMust(f) 470 } 471 } 472 query = boolQuery 473 } 474 return bleve.NewSearchRequest(query) 475 } 476 477 func buildQueryStringQuery(queryString string) query.Query { 478 switch queryString { 479 case "": 480 wildcardQuery := bleve.NewWildcardQuery("*") 481 wildcardQuery.SetField("query") 482 return wildcardQuery 483 default: 484 matchQuery := bleve.NewMatchQuery(queryString) 485 matchQuery.SetField("query") 486 return matchQuery 487 } 488 } 489 490 func (s *service) LastModified() time.Time { 491 return s.lastStoreUpdate 492 } 493 494 func (s *service) Stop() { 495 s.index.Close() 496 }