github.com/gohugoio/hugo@v0.88.1/related/inverted_index.go (about)

     1  // Copyright 2019 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  // Package related holds code to help finding related content.
    15  package related
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  	"math"
    21  	"sort"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/gohugoio/hugo/common/maps"
    26  
    27  	"github.com/gohugoio/hugo/common/types"
    28  	"github.com/mitchellh/mapstructure"
    29  )
    30  
    31  var (
    32  	_        Keyword = (*StringKeyword)(nil)
    33  	zeroDate         = time.Time{}
    34  
    35  	// DefaultConfig is the default related config.
    36  	DefaultConfig = Config{
    37  		Threshold: 80,
    38  		Indices: IndexConfigs{
    39  			IndexConfig{Name: "keywords", Weight: 100},
    40  			IndexConfig{Name: "date", Weight: 10},
    41  		},
    42  	}
    43  )
    44  
    45  /*
    46  Config is the top level configuration element used to configure how to retrieve
    47  related content in Hugo.
    48  
    49  An example site config.toml:
    50  
    51  	[related]
    52  	threshold = 1
    53  	[[related.indices]]
    54  	name = "keywords"
    55  	weight = 200
    56  	[[related.indices]]
    57  	name  = "tags"
    58  	weight = 100
    59  	[[related.indices]]
    60  	name  = "date"
    61  	weight = 1
    62  	pattern = "2006"
    63  */
    64  type Config struct {
    65  	// Only include matches >= threshold, a normalized rank between 0 and 100.
    66  	Threshold int
    67  
    68  	// To get stable "See also" sections we, by default, exclude newer related pages.
    69  	IncludeNewer bool
    70  
    71  	// Will lower case all string values and queries to the indices.
    72  	// May get better results, but at a slight performance cost.
    73  	ToLower bool
    74  
    75  	Indices IndexConfigs
    76  }
    77  
    78  // Add adds a given index.
    79  func (c *Config) Add(index IndexConfig) {
    80  	if c.ToLower {
    81  		index.ToLower = true
    82  	}
    83  	c.Indices = append(c.Indices, index)
    84  }
    85  
    86  // IndexConfigs holds a set of index configurations.
    87  type IndexConfigs []IndexConfig
    88  
    89  // IndexConfig configures an index.
    90  type IndexConfig struct {
    91  	// The index name. This directly maps to a field or Param name.
    92  	Name string
    93  
    94  	// Contextual pattern used to convert the Param value into a string.
    95  	// Currently only used for dates. Can be used to, say, bump posts in the same
    96  	// time frame when searching for related documents.
    97  	// For dates it follows Go's time.Format patterns, i.e.
    98  	// "2006" for YYYY and "200601" for YYYYMM.
    99  	Pattern string
   100  
   101  	// This field's weight when doing multi-index searches. Higher is "better".
   102  	Weight int
   103  
   104  	// Will lower case all string values in and queries tothis index.
   105  	// May get better accurate results, but at a slight performance cost.
   106  	ToLower bool
   107  }
   108  
   109  // Document is the interface an indexable document in Hugo must fulfill.
   110  type Document interface {
   111  	// RelatedKeywords returns a list of keywords for the given index config.
   112  	RelatedKeywords(cfg IndexConfig) ([]Keyword, error)
   113  
   114  	// When this document was or will be published.
   115  	PublishDate() time.Time
   116  
   117  	// Name is used as an tiebreaker if both Weight and PublishDate are
   118  	// the same.
   119  	Name() string
   120  }
   121  
   122  // InvertedIndex holds an inverted index, also sometimes named posting list, which
   123  // lists, for every possible search term, the documents that contain that term.
   124  type InvertedIndex struct {
   125  	cfg   Config
   126  	index map[string]map[Keyword][]Document
   127  
   128  	minWeight int
   129  	maxWeight int
   130  }
   131  
   132  func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
   133  	for _, conf := range idx.cfg.Indices {
   134  		if conf.Name == name {
   135  			return conf, true
   136  		}
   137  	}
   138  
   139  	return IndexConfig{}, false
   140  }
   141  
   142  // NewInvertedIndex creates a new InvertedIndex.
   143  // Documents to index must be added in Add.
   144  func NewInvertedIndex(cfg Config) *InvertedIndex {
   145  	idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg}
   146  	for _, conf := range cfg.Indices {
   147  		idx.index[conf.Name] = make(map[Keyword][]Document)
   148  		if conf.Weight < idx.minWeight {
   149  			// By default, the weight scale starts at 0, but we allow
   150  			// negative weights.
   151  			idx.minWeight = conf.Weight
   152  		}
   153  		if conf.Weight > idx.maxWeight {
   154  			idx.maxWeight = conf.Weight
   155  		}
   156  	}
   157  	return idx
   158  }
   159  
   160  // Add documents to the inverted index.
   161  // The value must support == and !=.
   162  func (idx *InvertedIndex) Add(docs ...Document) error {
   163  	var err error
   164  	for _, config := range idx.cfg.Indices {
   165  		if config.Weight == 0 {
   166  			// Disabled
   167  			continue
   168  		}
   169  		setm := idx.index[config.Name]
   170  
   171  		for _, doc := range docs {
   172  			var words []Keyword
   173  			words, err = doc.RelatedKeywords(config)
   174  			if err != nil {
   175  				continue
   176  			}
   177  
   178  			for _, keyword := range words {
   179  				setm[keyword] = append(setm[keyword], doc)
   180  			}
   181  		}
   182  	}
   183  
   184  	return err
   185  }
   186  
   187  // queryElement holds the index name and keywords that can be used to compose a
   188  // search for related content.
   189  type queryElement struct {
   190  	Index    string
   191  	Keywords []Keyword
   192  }
   193  
   194  func newQueryElement(index string, keywords ...Keyword) queryElement {
   195  	return queryElement{Index: index, Keywords: keywords}
   196  }
   197  
   198  type ranks []*rank
   199  
   200  type rank struct {
   201  	Doc     Document
   202  	Weight  int
   203  	Matches int
   204  }
   205  
   206  func (r *rank) addWeight(w int) {
   207  	r.Weight += w
   208  	r.Matches++
   209  }
   210  
   211  func newRank(doc Document, weight int) *rank {
   212  	return &rank{Doc: doc, Weight: weight, Matches: 1}
   213  }
   214  
   215  func (r ranks) Len() int      { return len(r) }
   216  func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
   217  func (r ranks) Less(i, j int) bool {
   218  	if r[i].Weight == r[j].Weight {
   219  		if r[i].Doc.PublishDate() == r[j].Doc.PublishDate() {
   220  			return r[i].Doc.Name() < r[j].Doc.Name()
   221  		}
   222  		return r[i].Doc.PublishDate().After(r[j].Doc.PublishDate())
   223  	}
   224  	return r[i].Weight > r[j].Weight
   225  }
   226  
   227  // SearchDoc finds the documents matching any of the keywords in the given indices
   228  // against the given document.
   229  // The resulting document set will be sorted according to number of matches
   230  // and the index weights, and any matches with a rank below the configured
   231  // threshold (normalize to 0..100) will be removed.
   232  // If an index name is provided, only that index will be queried.
   233  func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) {
   234  	var q []queryElement
   235  
   236  	var configs IndexConfigs
   237  
   238  	if len(indices) == 0 {
   239  		configs = idx.cfg.Indices
   240  	} else {
   241  		configs = make(IndexConfigs, len(indices))
   242  		for i, indexName := range indices {
   243  			cfg, found := idx.getIndexCfg(indexName)
   244  			if !found {
   245  				return nil, fmt.Errorf("index %q not found", indexName)
   246  			}
   247  			configs[i] = cfg
   248  		}
   249  	}
   250  
   251  	for _, cfg := range configs {
   252  		keywords, err := doc.RelatedKeywords(cfg)
   253  		if err != nil {
   254  			return nil, err
   255  		}
   256  
   257  		q = append(q, newQueryElement(cfg.Name, keywords...))
   258  
   259  	}
   260  
   261  	return idx.searchDate(doc.PublishDate(), q...)
   262  }
   263  
   264  // ToKeywords returns a Keyword slice of the given input.
   265  func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) {
   266  	var (
   267  		keywords []Keyword
   268  		toLower  = cfg.ToLower
   269  	)
   270  	switch vv := v.(type) {
   271  	case string:
   272  		if toLower {
   273  			vv = strings.ToLower(vv)
   274  		}
   275  		keywords = append(keywords, StringKeyword(vv))
   276  	case []string:
   277  		if toLower {
   278  			vc := make([]string, len(vv))
   279  			copy(vc, vv)
   280  			for i := 0; i < len(vc); i++ {
   281  				vc[i] = strings.ToLower(vc[i])
   282  			}
   283  			vv = vc
   284  		}
   285  		keywords = append(keywords, StringsToKeywords(vv...)...)
   286  	case time.Time:
   287  		layout := "2006"
   288  		if cfg.Pattern != "" {
   289  			layout = cfg.Pattern
   290  		}
   291  		keywords = append(keywords, StringKeyword(vv.Format(layout)))
   292  	case nil:
   293  		return keywords, nil
   294  	default:
   295  		return keywords, fmt.Errorf("indexing currently not supported for index %q and type %T", cfg.Name, vv)
   296  	}
   297  
   298  	return keywords, nil
   299  }
   300  
   301  // SearchKeyValues finds the documents matching any of the keywords in the given indices.
   302  // The resulting document set will be sorted according to number of matches
   303  // and the index weights, and any matches with a rank below the configured
   304  // threshold (normalize to 0..100) will be removed.
   305  func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) {
   306  	q := make([]queryElement, len(args))
   307  
   308  	for i, arg := range args {
   309  		var keywords []Keyword
   310  		key := arg.KeyString()
   311  		if key == "" {
   312  			return nil, fmt.Errorf("index %q not valid", arg.Key)
   313  		}
   314  		conf, found := idx.getIndexCfg(key)
   315  		if !found {
   316  			return nil, fmt.Errorf("index %q not found", key)
   317  		}
   318  
   319  		for _, val := range arg.Values {
   320  			k, err := conf.ToKeywords(val)
   321  			if err != nil {
   322  				return nil, err
   323  			}
   324  			keywords = append(keywords, k...)
   325  		}
   326  
   327  		q[i] = newQueryElement(conf.Name, keywords...)
   328  
   329  	}
   330  
   331  	return idx.search(q...)
   332  }
   333  
   334  func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) {
   335  	return idx.searchDate(zeroDate, query...)
   336  }
   337  
   338  func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) {
   339  	matchm := make(map[Document]*rank, 200)
   340  	applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero()
   341  
   342  	for _, el := range query {
   343  		setm, found := idx.index[el.Index]
   344  		if !found {
   345  			return []Document{}, fmt.Errorf("index for %q not found", el.Index)
   346  		}
   347  
   348  		config, found := idx.getIndexCfg(el.Index)
   349  		if !found {
   350  			return []Document{}, fmt.Errorf("index config for %q not found", el.Index)
   351  		}
   352  
   353  		for _, kw := range el.Keywords {
   354  			if docs, found := setm[kw]; found {
   355  				for _, doc := range docs {
   356  					if applyDateFilter {
   357  						// Exclude newer than the limit given
   358  						if doc.PublishDate().After(upperDate) {
   359  							continue
   360  						}
   361  					}
   362  					r, found := matchm[doc]
   363  					if !found {
   364  						matchm[doc] = newRank(doc, config.Weight)
   365  					} else {
   366  						r.addWeight(config.Weight)
   367  					}
   368  				}
   369  			}
   370  		}
   371  	}
   372  
   373  	if len(matchm) == 0 {
   374  		return []Document{}, nil
   375  	}
   376  
   377  	matches := make(ranks, 0, 100)
   378  
   379  	for _, v := range matchm {
   380  		avgWeight := v.Weight / v.Matches
   381  		weight := norm(avgWeight, idx.minWeight, idx.maxWeight)
   382  		threshold := idx.cfg.Threshold / v.Matches
   383  
   384  		if weight >= threshold {
   385  			matches = append(matches, v)
   386  		}
   387  	}
   388  
   389  	sort.Stable(matches)
   390  
   391  	result := make([]Document, len(matches))
   392  
   393  	for i, m := range matches {
   394  		result[i] = m.Doc
   395  	}
   396  
   397  	return result, nil
   398  }
   399  
   400  // normalizes num to a number between 0 and 100.
   401  func norm(num, min, max int) int {
   402  	if min > max {
   403  		panic("min > max")
   404  	}
   405  	return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5))
   406  }
   407  
   408  // DecodeConfig decodes a slice of map into Config.
   409  func DecodeConfig(m maps.Params) (Config, error) {
   410  	if m == nil {
   411  		return Config{}, errors.New("no related config provided")
   412  	}
   413  
   414  	if len(m) == 0 {
   415  		return Config{}, errors.New("empty related config provided")
   416  	}
   417  
   418  	var c Config
   419  
   420  	if err := mapstructure.WeakDecode(m, &c); err != nil {
   421  		return c, err
   422  	}
   423  
   424  	if c.Threshold < 0 || c.Threshold > 100 {
   425  		return Config{}, errors.New("related threshold must be between 0 and 100")
   426  	}
   427  
   428  	if c.ToLower {
   429  		for i := range c.Indices {
   430  			c.Indices[i].ToLower = true
   431  		}
   432  	}
   433  
   434  	return c, nil
   435  }
   436  
   437  // StringKeyword is a string search keyword.
   438  type StringKeyword string
   439  
   440  func (s StringKeyword) String() string {
   441  	return string(s)
   442  }
   443  
   444  // Keyword is the interface a keyword in the search index must implement.
   445  type Keyword interface {
   446  	String() string
   447  }
   448  
   449  // StringsToKeywords converts the given slice of strings to a slice of Keyword.
   450  func StringsToKeywords(s ...string) []Keyword {
   451  	kw := make([]Keyword, len(s))
   452  
   453  	for i := 0; i < len(s); i++ {
   454  		kw[i] = StringKeyword(s[i])
   455  	}
   456  
   457  	return kw
   458  }