eintopf.info@v0.13.16/service/search/search.go (about)

     1  // Copyright (C) 2022 The Eintopf authors
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <https://www.gnu.org/licenses/>.
    15  
    16  package search
    17  
import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"eintopf.info/internal/cache"
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/lang/de"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
)
    35  
    36  // Indexable defines a document, that can be indexed.
    37  // Any model, that implements this interface can be indexed and therefore is
    38  // searchable.
    39  // An indexable document can be uniquely identifyed by combining its identifier
    40  // and type.
    41  type Indexable interface {
    42  	// Identifier returns an id which should uniquely identify the object for
    43  	// its type.
    44  	Identifier() string
    45  
    46  	// Type returns the type of the object.
    47  	Type() string
    48  
    49  	// QueryText returns the string to index for a text search.
    50  	QueryText() string
    51  
    52  	// SearchFields returns a map of additional fields to be indexed. Those
    53  	// fields can be used for filtering or aggregations.
    54  	SearchFields() map[string]interface{}
    55  }
    56  
    57  // Service defines a search service.
    58  //
    59  // -go:generate go run github.com/petergtz/pegomock/pegomock generate eintopf.info/service/search Service --output=../../internal/mock/search_service.go --package=mock --mock-name=SearchService
    60  type Service interface {
    61  	// Index takes one or many indexable document and adds them to the search index.
    62  	Index(docs ...Indexable) error
    63  
    64  	// Delete deletes the document with the given type and id.
    65  	// Note: both type and id have to be provided in order to uniquely identify
    66  	// the document.
    67  	Delete(docType, id string) error
    68  
    69  	// Search performes a full text search on all indexed documents.
    70  	// Performes a wildcard match for empty queries.
    71  	// In addition to the search query a set of additional search options can be
    72  	// provided.
    73  	Search(ctx context.Context, opts *Options) (*Result, error)
    74  
    75  	// LastModified returns the last time, the store was updated. This can be
    76  	// used to invalidate a client side cache.
    77  	LastModified() time.Time
    78  
    79  	// Stops the search service.
    80  	Stop()
    81  }
    82  
    83  // Options defines a set of optional search options.
    84  type Options struct {
    85  	// Query is the search query used for a text search.
    86  	Query string `json:"query"`
    87  
    88  	// Sort is the field, that should be sorted by.
    89  	// When left empty, the default sorting is used.
    90  	Sort string `json:"sort"`
    91  
    92  	// SortDescending defines the sort order.
    93  	SortDescending bool `json:"sortAscending"`
    94  
    95  	// Page is current page.
    96  	Page int `json:"page"`
    97  
    98  	// PageSize defines the number of hits returned per page.
    99  	//
   100  	// PageSize is infinite when set to 0.
   101  	PageSize int `json:"pageSize"`
   102  
   103  	// Filters is a list of filters, that reduce the search result. All filters
   104  	// are combined with AND logic in addition with the search query.
   105  	Filters []Filter `json:"filter"`
   106  
   107  	// Aggregations is a map of aggregations, to perform aggregations on fields.
   108  	// The provided map key can be used to identify the corresponding bucket in
   109  	// the result.
   110  	Aggregations map[string]Aggregation `json:"aggregations"`
   111  }
   112  
   113  // CacheKey returns a string uniquely identifying the Options object.
   114  func (o *Options) CacheKey() string {
   115  	key := o.Query + o.Sort + strconv.FormatBool(o.SortDescending) + strconv.Itoa(o.Page) + strconv.Itoa(o.Page)
   116  	for _, f := range o.Filters {
   117  		key += f.CacheKey()
   118  	}
   119  	for k, a := range o.Aggregations {
   120  		key += k + a.CacheKey()
   121  	}
   122  	return key
   123  }
   124  
   125  // Result contains a search result.
   126  type Result struct {
   127  	// Hits are the search hits for the current pagination.
   128  	Hits []Hit `json:"hits"`
   129  
   130  	// Total is the total number of search hits.
   131  	// It is independet of the current pagination.
   132  	Total uint64 `json:"total"`
   133  
   134  	// Buckets is a set of aggregation buckets.
   135  	// The map key corresponds to aggregation name.
   136  	Buckets map[string]Bucket `json:"buckets"`
   137  }
   138  
   139  // Hit is a single search hit.
   140  type Hit struct {
   141  	// ID is the unique identifier of the stored entity. It might not match the
   142  	// id of the entity in case it was indexed multiple times.
   143  	ID string `json:"id"`
   144  	// Type is the type of the document.
   145  	Type string `json:"type"`
   146  	// Raw contains the raw data of the document in the form of a json string.
   147  	Raw string `json:"raw"`
   148  }
   149  
   150  // Unmarshal unmarshals the raw data into v using json.Unmarshal.
   151  func (h *Hit) Unmarshal(v interface{}) error {
   152  	return json.Unmarshal([]byte(h.Raw), v)
   153  }
   154  
   155  // NewService returns a new search service.
   156  // Takes the path to the index directory. If the index already exits, the
   157  // existing index is used. Otherwise a new one will be created.
   158  func NewService(indexPath string, searchTimeout time.Duration, resultCacheSize int, bucketCacheSize int, tz *time.Location) (Service, error) {
   159  	var index bleve.Index
   160  
   161  	// Check if an index already exists at the specified index path. Use the
   162  	// existing index, if it exists.
   163  	if _, err := os.Stat(indexPath); !os.IsNotExist(err) {
   164  		index, err = bleve.Open(indexPath)
   165  		if err != nil {
   166  			return nil, err
   167  		}
   168  	} else {
   169  		// Create a simple index mapping, containing a single document mapping.
   170  		mapping := bleve.NewIndexMapping()
   171  		mapping.DefaultType = "doc"
   172  		mapping.AddDocumentMapping("doc", documentMapping())
   173  
   174  		index, err = bleve.New(indexPath, mapping)
   175  		if err != nil {
   176  			return nil, err
   177  		}
   178  	}
   179  
   180  	return &service{
   181  		tz:              tz,
   182  		lastStoreUpdate: time.Now(),
   183  		index:           index,
   184  		searchTimeout:   searchTimeout,
   185  		cache:           cache.NewFavoritesCache[*Result](resultCacheSize),
   186  		bucketCache:     cache.NewFavoritesCache[Bucket](bucketCacheSize),
   187  	}, nil
   188  }
   189  
   190  // service is an implementation of the Service interface using bleve search. It
   191  // is internal to hide implementation details.
   192  type service struct {
   193  	tz *time.Location
   194  
   195  	// lastUpdate stores the time of the last store update.
   196  	lastStoreUpdate  time.Time
   197  	mLastStoreUpdate sync.Mutex
   198  
   199  	// index is the bleve index. The index is kept simple, by only storing one
   200  	// data structure (document).
   201  	index bleve.Index
   202  
   203  	// searchTimeout is the maximum duration a search request may take.
   204  	// When the request takes longer, it gets canceled.
   205  	searchTimeout time.Duration
   206  
   207  	// cache stores search result.
   208  	//
   209  	// The cache gets cleared, when an index or delete operation is performed
   210  	cache *cache.Favorites[*Result]
   211  	// bucketCache stores buckets resulting from a unique aggregation. The cache
   212  	// key is retrieved from the CacheKey method on the aggregation.
   213  	//
   214  	// The cache gets cleared, when an index or delete operation is performed.
   215  	bucketCache *cache.Favorites[Bucket]
   216  }
   217  
   218  // document is the internal data structure, that gets indexed into bleve.
   219  // This document provides a structure for the index data, while beeing generic
   220  // enough to allow multiple data types, text quering and filters and
   221  // aggregations on specified fields.
   222  type document struct {
   223  	// Type is the external type of the document.
   224  	Type string `json:"type"`
   225  	// Raw stores the actual document in a json marshaled form.
   226  	Raw string `json:"raw"`
   227  	// Query is a special field allowing text queries.
   228  	Query string `json:"query"`
   229  	// Fields holds a set of fields, that can be used for filtering or
   230  	// aggregations.
   231  	Fields map[string]interface{} `json:"fields"`
   232  }
   233  
   234  // documentMapping returns the bleve document mapping for the document data
   235  // structure.
   236  func documentMapping() *mapping.DocumentMapping {
   237  	queryMapping := bleve.NewTextFieldMapping()
   238  	queryMapping.Analyzer = de.AnalyzerName
   239  
   240  	m := bleve.NewDocumentMapping()
   241  	m.AddFieldMappingsAt("type", bleve.NewKeywordFieldMapping())
   242  	m.AddFieldMappingsAt("query", queryMapping)
   243  	m.AddFieldMappingsAt("raw", &mapping.FieldMapping{
   244  		Type:               "text",
   245  		Store:              true,
   246  		Index:              false,
   247  		IncludeTermVectors: false,
   248  		IncludeInAll:       false,
   249  		DocValues:          false,
   250  	})
   251  	m.AddSubDocumentMapping("fields", bleve.NewDocumentMapping())
   252  
   253  	return m
   254  }
   255  
   256  func fieldKey(field string) string {
   257  	if field == "type" {
   258  		return field
   259  	}
   260  	return "fields." + field
   261  }
   262  
   263  func uniqueID(docType string, id string) string {
   264  	return docType + "_" + id
   265  }
   266  
   267  // Index takes an indexable document and converts it into the internal document
   268  // data structure, which then gets indexed into the bleve index.
   269  //
   270  // Returns an error if the document cannot be marshaled into a json string.
   271  // Returns an error if the bleve index operation failed.
   272  func (s *service) Index(docs ...Indexable) error {
   273  	batch := s.index.NewBatch()
   274  	for _, doc := range docs {
   275  		raw, err := json.Marshal(doc)
   276  		if err != nil {
   277  			return err
   278  		}
   279  		batch.Index(uniqueID(doc.Type(), doc.Identifier()), document{
   280  			Type:   doc.Type(),
   281  			Query:  doc.QueryText(),
   282  			Raw:    string(raw),
   283  			Fields: doc.SearchFields(),
   284  		})
   285  	}
   286  	err := s.index.Batch(batch)
   287  	if err != nil {
   288  		return err
   289  	}
   290  
   291  	s.cache.Clear()
   292  	s.bucketCache.Clear()
   293  	s.updateLastStoreUpdate()
   294  
   295  	return nil
   296  }
   297  
   298  // Delete removes the document from the bleve index.
   299  func (s *service) Delete(docType string, id string) error {
   300  	err := s.index.Delete(uniqueID(docType, id))
   301  	if err != nil {
   302  		return err
   303  	}
   304  
   305  	s.cache.Clear()
   306  	s.bucketCache.Clear()
   307  	s.updateLastStoreUpdate()
   308  
   309  	return nil
   310  }
   311  
   312  func (s *service) updateLastStoreUpdate() {
   313  	s.mLastStoreUpdate.Lock()
   314  	s.lastStoreUpdate = time.Now()
   315  	s.mLastStoreUpdate.Unlock()
   316  }
   317  
   318  // Search perfomes a search request on the bleve index.
   319  func (s *service) Search(ctx context.Context, opts *Options) (*Result, error) {
   320  	ctx, cancel := context.WithTimeout(ctx, s.searchTimeout)
   321  	defer cancel()
   322  
   323  	if opts == nil {
   324  		opts = &Options{}
   325  	}
   326  	if result, ok := s.cache.Get(opts.CacheKey()); ok {
   327  		return result, nil
   328  	}
   329  
   330  	sRequest := buildSearchRequest(opts.Query, opts.Filters)
   331  	if opts.PageSize > 0 {
   332  		// If the page size is larger than 0, paginate the result according to
   333  		// the page and page size.
   334  		sRequest.From = opts.Page * opts.PageSize
   335  		sRequest.Size = opts.PageSize
   336  	} else {
   337  		sRequest.Size = 100000
   338  	}
   339  
   340  	// Only retrieve the "raw" and "type" fields.
   341  	sRequest.Fields = []string{"raw", "type"}
   342  
   343  	if opts.Sort != "" {
   344  		sort := fieldKey(opts.Sort)
   345  		if opts.SortDescending {
   346  			sort = fmt.Sprintf("-%s", sort)
   347  		}
   348  		sRequest.SortBy([]string{sort})
   349  	}
   350  
   351  	type searchResult struct {
   352  		err   error
   353  		hits  []Hit
   354  		total uint64
   355  	}
   356  	sChan := make(chan searchResult)
   357  	go func() {
   358  		result, err := s.index.SearchInContext(ctx, sRequest)
   359  		if err != nil {
   360  			sChan <- searchResult{err: err}
   361  			return
   362  		}
   363  		hits := make([]Hit, 0, len(result.Hits))
   364  		for _, hit := range result.Hits {
   365  			typ := hit.Fields["type"].(string)
   366  			hits = append(hits, Hit{
   367  				ID:   strings.TrimPrefix(hit.ID, fmt.Sprintf("%s_", typ)),
   368  				Type: typ,
   369  				Raw:  hit.Fields["raw"].(string),
   370  			})
   371  		}
   372  		sChan <- searchResult{hits: hits, total: result.Total}
   373  	}()
   374  
   375  	// Perform a seperate search request per aggregation. This enables
   376  	// independet aggregations.
   377  	buckets := make(map[string]Bucket, len(opts.Aggregations))
   378  	bucketsM := sync.Mutex{}
   379  	bucketsWG := sync.WaitGroup{}
   380  
   381  	for name, aggregation := range opts.Aggregations {
   382  		bucketsWG.Add(1)
   383  		go func(name string, aggregation Aggregation) {
   384  			defer bucketsWG.Done()
   385  
   386  			// Check if the bucket resulting in this aggregation is cached.
   387  			cacheKey := opts.Query + aggregation.CacheKey()
   388  			if bucket, ok := s.bucketCache.Get(cacheKey); ok && bucket != nil {
   389  				bucketsM.Lock()
   390  				defer bucketsM.Unlock()
   391  
   392  				buckets[name] = bucket
   393  				return
   394  			}
   395  			bucket, err := s.aggregate(ctx, opts.Query, aggregation)
   396  			if err != nil {
   397  				// Log the error instead of returning it. This makes sure the search
   398  				// doesn't fail, if a search request has an invalid aggregation.
   399  				log.Printf("aggregate: %s: %s\n", name, err)
   400  			}
   401  			s.bucketCache.Set(cacheKey, bucket)
   402  
   403  			bucketsM.Lock()
   404  			defer bucketsM.Unlock()
   405  			buckets[name] = bucket
   406  		}(name, aggregation)
   407  	}
   408  
   409  	sResult := <-sChan
   410  	if sResult.err != nil {
   411  		return nil, sResult.err
   412  	}
   413  	bucketsWG.Wait()
   414  
   415  	result := &Result{Hits: sResult.hits, Total: sResult.total, Buckets: buckets}
   416  
   417  	s.cache.Set(opts.CacheKey(), result)
   418  
   419  	return result, nil
   420  }
   421  
   422  func (s *service) aggregate(ctx context.Context, queryString string, aggregation Aggregation) (Bucket, error) {
   423  	search := buildSearchRequest(queryString, aggregation.Filters)
   424  	search.Fields = []string{fieldKey(aggregation.Field)}
   425  	search.Size = 10000
   426  	result, err := s.index.SearchInContext(ctx, search)
   427  	if err != nil {
   428  		return nil, fmt.Errorf("aggregate: %s", err)
   429  	}
   430  
   431  	var aggregator aggregator
   432  	switch aggregation.Type {
   433  	case TermsAggregation:
   434  		aggregator = &termsAggregator{terms: make(map[string]int)}
   435  	case DateRangeAggregation:
   436  		aggregator = &dateRangeAggregator{
   437  			min: time.Date(9999, 0, 0, 0, 0, 0, 0, s.tz),
   438  			max: time.Unix(0, 0),
   439  		}
   440  	default:
   441  		return nil, fmt.Errorf("invalid aggregation type: %s", aggregation.Type)
   442  	}
   443  	for _, hit := range result.Hits {
   444  		field, ok := hit.Fields[fieldKey(aggregation.Field)]
   445  		if !ok {
   446  			continue
   447  		}
   448  		err := aggregator.aggregate(field)
   449  		if err != nil {
   450  			return nil, fmt.Errorf("aggregagte(%s): %s", field, err)
   451  		}
   452  	}
   453  	return aggregator.bucket(), nil
   454  }
   455  
   456  // buildSearchRequest builds a bleve.SearchRequest from a query string and a set
   457  // of filters.
   458  // All filters and the query string query are combined with AND logic.
   459  // For the query string a match query gets created. If the query string is
   460  // empty, a wildcard query gets created.
   461  func buildSearchRequest(queryString string, filters []Filter) *bleve.SearchRequest {
   462  	query := buildQueryStringQuery(queryString)
   463  
   464  	if len(filters) > 0 {
   465  		boolQuery := bleve.NewBooleanQuery()
   466  		boolQuery.AddMust(query)
   467  		for _, filter := range filters {
   468  			if f := filter.filterQuery(); f != nil {
   469  				boolQuery.AddMust(f)
   470  			}
   471  		}
   472  		query = boolQuery
   473  	}
   474  	return bleve.NewSearchRequest(query)
   475  }
   476  
   477  func buildQueryStringQuery(queryString string) query.Query {
   478  	switch queryString {
   479  	case "":
   480  		wildcardQuery := bleve.NewWildcardQuery("*")
   481  		wildcardQuery.SetField("query")
   482  		return wildcardQuery
   483  	default:
   484  		matchQuery := bleve.NewMatchQuery(queryString)
   485  		matchQuery.SetField("query")
   486  		return matchQuery
   487  	}
   488  }
   489  
   490  func (s *service) LastModified() time.Time {
   491  	return s.lastStoreUpdate
   492  }
   493  
   494  func (s *service) Stop() {
   495  	s.index.Close()
   496  }