github.com/schumacherfm/hugo@v0.47.1/related/inverted_index.go (about)

     1  // Copyright 2017-present The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  // Package related holds code to help finding related content.
    15  package related
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  	"math"
    21  	"sort"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/gohugoio/hugo/common/types"
    26  	"github.com/mitchellh/mapstructure"
    27  )
    28  
var (
	_        Keyword = (*StringKeyword)(nil) // compile-time check that *StringKeyword implements Keyword
	zeroDate         = time.Time{}           // the zero time; search passes it to searchDate, where a zero date turns the date filter off

	// DefaultConfig is the default related config: a threshold of 80, with
	// the "keywords" index weighted 100 and the "date" index weighted 10.
	DefaultConfig = Config{
		Threshold: 80,
		Indices: IndexConfigs{
			IndexConfig{Name: "keywords", Weight: 100},
			IndexConfig{Name: "date", Weight: 10},
		},
	}
)
    42  
/*
Config is the top level configuration element used to configure how to retrieve
related content in Hugo.

An example site config.toml:

	[related]
	threshold = 1
	[[related.indices]]
	name = "keywords"
	weight = 200
	[[related.indices]]
	name  = "tags"
	weight = 100
	[[related.indices]]
	name  = "date"
	weight = 1
	pattern = "2006"
*/
type Config struct {
	// Only include matches >= threshold, a normalized rank between 0 and 100.
	// DecodeConfig rejects values outside that range.
	Threshold int

	// To get stable "See also" sections we, by default, exclude newer related pages.
	// Set to true to include documents published after the one searched for.
	IncludeNewer bool

	// Will lower case all string values and queries to the indices.
	// May get better results, but at a slight performance cost.
	// Add and DecodeConfig copy a true value onto every IndexConfig.
	ToLower bool

	// The indices to build and query, each with its own name and weight.
	Indices IndexConfigs
}
    75  
    76  func (c *Config) Add(index IndexConfig) {
    77  	if c.ToLower {
    78  		index.ToLower = true
    79  	}
    80  	c.Indices = append(c.Indices, index)
    81  }
    82  
// IndexConfigs holds a set of index configurations, in the order they were
// configured.
type IndexConfigs []IndexConfig
    85  
// IndexConfig configures an index.
type IndexConfig struct {
	// The index name. This directly maps to a field or Param name.
	Name string

	// Contextual pattern used to convert the Param value into a string.
	// Currently only used for dates. Can be used to, say, bump posts in the same
	// time frame when searching for related documents.
	// For dates it follows Go's time.Format patterns, i.e.
	// "2006" for YYYY and "200601" for YYYYMM.
	// When empty, ToKeywords formats dates with "2006".
	Pattern string

	// This field's weight when doing multi-index searches. Higher is "better".
	// A weight of 0 disables the index: InvertedIndex.Add skips it.
	Weight int

	// Will lower case all string values in and queries to this index.
	// May get more accurate results, but at a slight performance cost.
	ToLower bool
}
   105  
// Document is the interface an indexable document in Hugo must fulfill.
//
// Documents are used as map keys while searching, so the dynamic type of an
// implementation must be comparable.
type Document interface {
	// SearchKeywords returns a list of keywords for the given index config.
	SearchKeywords(cfg IndexConfig) ([]Keyword, error)

	// When this document was or will be published.
	// Used to filter out newer documents and to break ties between
	// equally weighted matches.
	PubDate() time.Time
}
   114  
// InvertedIndex holds an inverted index, also sometimes named posting list, which
// lists, for every possible search term, the documents that contain that term.
//
// Create one with NewInvertedIndex, which also allocates the per-index maps.
type InvertedIndex struct {
	cfg   Config                            // the configuration the index was created with
	index map[string]map[Keyword][]Document // index name -> keyword -> documents carrying that keyword

	minWeight int // lowest configured index weight, never above 0; lower bound when normalizing ranks
	maxWeight int // highest configured index weight, never below 0; upper bound when normalizing ranks
}
   124  
   125  func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
   126  	for _, conf := range idx.cfg.Indices {
   127  		if conf.Name == name {
   128  			return conf, true
   129  		}
   130  	}
   131  
   132  	return IndexConfig{}, false
   133  }
   134  
   135  // NewInvertedIndex creates a new InvertedIndex.
   136  // Documents to index must be added in Add.
   137  func NewInvertedIndex(cfg Config) *InvertedIndex {
   138  	idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg}
   139  	for _, conf := range cfg.Indices {
   140  		idx.index[conf.Name] = make(map[Keyword][]Document)
   141  		if conf.Weight < idx.minWeight {
   142  			// By default, the weight scale starts at 0, but we allow
   143  			// negative weights.
   144  			idx.minWeight = conf.Weight
   145  		}
   146  		if conf.Weight > idx.maxWeight {
   147  			idx.maxWeight = conf.Weight
   148  		}
   149  	}
   150  	return idx
   151  }
   152  
   153  // Add documents to the inverted index.
   154  // The value must support == and !=.
   155  func (idx *InvertedIndex) Add(docs ...Document) error {
   156  	var err error
   157  	for _, config := range idx.cfg.Indices {
   158  		if config.Weight == 0 {
   159  			// Disabled
   160  			continue
   161  		}
   162  		setm := idx.index[config.Name]
   163  
   164  		for _, doc := range docs {
   165  			var words []Keyword
   166  			words, err = doc.SearchKeywords(config)
   167  			if err != nil {
   168  				continue
   169  			}
   170  
   171  			for _, keyword := range words {
   172  				setm[keyword] = append(setm[keyword], doc)
   173  			}
   174  		}
   175  	}
   176  
   177  	return err
   178  
   179  }
   180  
// queryElement holds the index name and keywords that can be used to compose a
// search for related content.
type queryElement struct {
	Index    string    // name of the index to look the keywords up in
	Keywords []Keyword // the keywords to look up in that index
}
   187  
   188  func newQueryElement(index string, keywords ...Keyword) queryElement {
   189  	return queryElement{Index: index, Keywords: keywords}
   190  }
   191  
// ranks is a list of search hits. It implements sort.Interface, ordering by
// weight (highest first) and then by publish date (newest first).
type ranks []*rank

// rank accumulates the search score of a single matched document.
type rank struct {
	Doc     Document // the matched document
	Weight  int      // sum of the index weights of all keyword matches
	Matches int      // number of keyword matches that contributed to Weight
}
   199  
   200  func (r *rank) addWeight(w int) {
   201  	r.Weight += w
   202  	r.Matches++
   203  }
   204  
   205  func newRank(doc Document, weight int) *rank {
   206  	return &rank{Doc: doc, Weight: weight, Matches: 1}
   207  }
   208  
   209  func (r ranks) Len() int      { return len(r) }
   210  func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
   211  func (r ranks) Less(i, j int) bool {
   212  	if r[i].Weight == r[j].Weight {
   213  		return r[i].Doc.PubDate().After(r[j].Doc.PubDate())
   214  	}
   215  	return r[i].Weight > r[j].Weight
   216  }
   217  
   218  // SearchDoc finds the documents matching any of the keywords in the given indices
   219  // against the given document.
   220  // The resulting document set will be sorted according to number of matches
   221  // and the index weights, and any matches with a rank below the configured
   222  // threshold (normalize to 0..100) will be removed.
   223  // If an index name is provided, only that index will be queried.
   224  func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) {
   225  	var q []queryElement
   226  
   227  	var configs IndexConfigs
   228  
   229  	if len(indices) == 0 {
   230  		configs = idx.cfg.Indices
   231  	} else {
   232  		configs = make(IndexConfigs, len(indices))
   233  		for i, indexName := range indices {
   234  			cfg, found := idx.getIndexCfg(indexName)
   235  			if !found {
   236  				return nil, fmt.Errorf("index %q not found", indexName)
   237  			}
   238  			configs[i] = cfg
   239  		}
   240  	}
   241  
   242  	for _, cfg := range configs {
   243  		keywords, err := doc.SearchKeywords(cfg)
   244  		if err != nil {
   245  			return nil, err
   246  		}
   247  
   248  		q = append(q, newQueryElement(cfg.Name, keywords...))
   249  
   250  	}
   251  
   252  	return idx.searchDate(doc.PubDate(), q...)
   253  }
   254  
   255  func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) {
   256  	var (
   257  		keywords []Keyword
   258  		toLower  = cfg.ToLower
   259  	)
   260  	switch vv := v.(type) {
   261  	case string:
   262  		if toLower {
   263  			vv = strings.ToLower(vv)
   264  		}
   265  		keywords = append(keywords, StringKeyword(vv))
   266  	case []string:
   267  		if toLower {
   268  			for i := 0; i < len(vv); i++ {
   269  				vv[i] = strings.ToLower(vv[i])
   270  			}
   271  		}
   272  		keywords = append(keywords, StringsToKeywords(vv...)...)
   273  	case time.Time:
   274  		layout := "2006"
   275  		if cfg.Pattern != "" {
   276  			layout = cfg.Pattern
   277  		}
   278  		keywords = append(keywords, StringKeyword(vv.Format(layout)))
   279  	case nil:
   280  		return keywords, nil
   281  	default:
   282  		return keywords, fmt.Errorf("indexing currently not supported for for index %q and type %T", cfg.Name, vv)
   283  	}
   284  
   285  	return keywords, nil
   286  }
   287  
   288  // SearchKeyValues finds the documents matching any of the keywords in the given indices.
   289  // The resulting document set will be sorted according to number of matches
   290  // and the index weights, and any matches with a rank below the configured
   291  // threshold (normalize to 0..100) will be removed.
   292  func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) {
   293  	q := make([]queryElement, len(args))
   294  
   295  	for i, arg := range args {
   296  		var keywords []Keyword
   297  		key := arg.KeyString()
   298  		if key == "" {
   299  			return nil, fmt.Errorf("index %q not valid", arg.Key)
   300  		}
   301  		conf, found := idx.getIndexCfg(key)
   302  		if !found {
   303  			return nil, fmt.Errorf("index %q not found", key)
   304  		}
   305  
   306  		for _, val := range arg.Values {
   307  			k, err := conf.ToKeywords(val)
   308  			if err != nil {
   309  				return nil, err
   310  			}
   311  			keywords = append(keywords, k...)
   312  		}
   313  
   314  		q[i] = newQueryElement(conf.Name, keywords...)
   315  
   316  	}
   317  
   318  	return idx.search(q...)
   319  }
   320  
   321  func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) {
   322  	return idx.searchDate(zeroDate, query...)
   323  }
   324  
   325  func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) {
   326  	matchm := make(map[Document]*rank, 200)
   327  	applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero()
   328  
   329  	for _, el := range query {
   330  		setm, found := idx.index[el.Index]
   331  		if !found {
   332  			return []Document{}, fmt.Errorf("index for %q not found", el.Index)
   333  		}
   334  
   335  		config, found := idx.getIndexCfg(el.Index)
   336  		if !found {
   337  			return []Document{}, fmt.Errorf("index config for %q not found", el.Index)
   338  		}
   339  
   340  		for _, kw := range el.Keywords {
   341  			if docs, found := setm[kw]; found {
   342  				for _, doc := range docs {
   343  					if applyDateFilter {
   344  						// Exclude newer than the limit given
   345  						if doc.PubDate().After(upperDate) {
   346  							continue
   347  						}
   348  					}
   349  					r, found := matchm[doc]
   350  					if !found {
   351  						matchm[doc] = newRank(doc, config.Weight)
   352  					} else {
   353  						r.addWeight(config.Weight)
   354  					}
   355  				}
   356  			}
   357  		}
   358  	}
   359  
   360  	if len(matchm) == 0 {
   361  		return []Document{}, nil
   362  	}
   363  
   364  	matches := make(ranks, 0, 100)
   365  
   366  	for _, v := range matchm {
   367  		avgWeight := v.Weight / v.Matches
   368  		weight := norm(avgWeight, idx.minWeight, idx.maxWeight)
   369  		threshold := idx.cfg.Threshold / v.Matches
   370  
   371  		if weight >= threshold {
   372  			matches = append(matches, v)
   373  		}
   374  	}
   375  
   376  	sort.Stable(matches)
   377  
   378  	result := make([]Document, len(matches))
   379  
   380  	for i, m := range matches {
   381  		result[i] = m.Doc
   382  	}
   383  
   384  	return result, nil
   385  }
   386  
   387  // normalizes num to a number between 0 and 100.
   388  func norm(num, min, max int) int {
   389  	if min > max {
   390  		panic("min > max")
   391  	}
   392  	return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5))
   393  }
   394  
   395  // DecodeConfig decodes a slice of map into Config.
   396  func DecodeConfig(in interface{}) (Config, error) {
   397  	if in == nil {
   398  		return Config{}, errors.New("no related config provided")
   399  	}
   400  
   401  	m, ok := in.(map[string]interface{})
   402  	if !ok {
   403  		return Config{}, fmt.Errorf("expected map[string]interface {} got %T", in)
   404  	}
   405  
   406  	if len(m) == 0 {
   407  		return Config{}, errors.New("empty related config provided")
   408  	}
   409  
   410  	var c Config
   411  
   412  	if err := mapstructure.WeakDecode(m, &c); err != nil {
   413  		return c, err
   414  	}
   415  
   416  	if c.Threshold < 0 || c.Threshold > 100 {
   417  		return Config{}, errors.New("related threshold must be between 0 and 100")
   418  	}
   419  
   420  	if c.ToLower {
   421  		for i := range c.Indices {
   422  			c.Indices[i].ToLower = true
   423  		}
   424  	}
   425  
   426  	return c, nil
   427  }
   428  
   429  // StringKeyword is a string search keyword.
   430  type StringKeyword string
   431  
   432  func (s StringKeyword) String() string {
   433  	return string(s)
   434  }
   435  
// Keyword is the interface a keyword in the search index must implement.
//
// Keywords are used as map keys in the index, so the dynamic type of an
// implementation must be comparable.
type Keyword interface {
	// String returns the keyword as a string.
	String() string
}
   440  
   441  // StringsToKeywords converts the given slice of strings to a slice of Keyword.
   442  func StringsToKeywords(s ...string) []Keyword {
   443  	kw := make([]Keyword, len(s))
   444  
   445  	for i := 0; i < len(s); i++ {
   446  		kw[i] = StringKeyword(s[i])
   447  	}
   448  
   449  	return kw
   450  }