github.com/schumacherfm/hugo@v0.47.1/related/inverted_index.go (about) 1 // Copyright 2017-present The Hugo Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // Package related holds code to help finding related content. 15 package related 16 17 import ( 18 "errors" 19 "fmt" 20 "math" 21 "sort" 22 "strings" 23 "time" 24 25 "github.com/gohugoio/hugo/common/types" 26 "github.com/mitchellh/mapstructure" 27 ) 28 29 var ( 30 _ Keyword = (*StringKeyword)(nil) 31 zeroDate = time.Time{} 32 33 // DefaultConfig is the default related config. 34 DefaultConfig = Config{ 35 Threshold: 80, 36 Indices: IndexConfigs{ 37 IndexConfig{Name: "keywords", Weight: 100}, 38 IndexConfig{Name: "date", Weight: 10}, 39 }, 40 } 41 ) 42 43 /* 44 Config is the top level configuration element used to configure how to retrieve 45 related content in Hugo. 46 47 An example site config.toml: 48 49 [related] 50 threshold = 1 51 [[related.indices]] 52 name = "keywords" 53 weight = 200 54 [[related.indices]] 55 name = "tags" 56 weight = 100 57 [[related.indices]] 58 name = "date" 59 weight = 1 60 pattern = "2006" 61 */ 62 type Config struct { 63 // Only include matches >= threshold, a normalized rank between 0 and 100. 64 Threshold int 65 66 // To get stable "See also" sections we, by default, exclude newer related pages. 67 IncludeNewer bool 68 69 // Will lower case all string values and queries to the indices. 70 // May get better results, but at a slight performance cost. 71 ToLower bool 72 73 Indices IndexConfigs 74 } 75 76 func (c *Config) Add(index IndexConfig) { 77 if c.ToLower { 78 index.ToLower = true 79 } 80 c.Indices = append(c.Indices, index) 81 } 82 83 // IndexConfigs holds a set of index configurations. 84 type IndexConfigs []IndexConfig 85 86 // IndexConfig configures an index. 87 type IndexConfig struct { 88 // The index name. This directly maps to a field or Param name. 89 Name string 90 91 // Contextual pattern used to convert the Param value into a string. 92 // Currently only used for dates. Can be used to, say, bump posts in the same 93 // time frame when searching for related documents. 94 // For dates it follows Go's time.Format patterns, i.e. 95 // "2006" for YYYY and "200601" for YYYYMM. 96 Pattern string 97 98 // This field's weight when doing multi-index searches. Higher is "better". 99 Weight int 100 101 // Will lower case all string values in and queries tothis index. 102 // May get better accurate results, but at a slight performance cost. 103 ToLower bool 104 } 105 106 // Document is the interface an indexable document in Hugo must fulfill. 107 type Document interface { 108 // SearchKeywords returns a list of keywords for the given index config. 109 SearchKeywords(cfg IndexConfig) ([]Keyword, error) 110 111 // When this document was or will be published. 112 PubDate() time.Time 113 } 114 115 // InvertedIndex holds an inverted index, also sometimes named posting list, which 116 // lists, for every possible search term, the documents that contain that term. 117 type InvertedIndex struct { 118 cfg Config 119 index map[string]map[Keyword][]Document 120 121 minWeight int 122 maxWeight int 123 } 124 125 func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) { 126 for _, conf := range idx.cfg.Indices { 127 if conf.Name == name { 128 return conf, true 129 } 130 } 131 132 return IndexConfig{}, false 133 } 134 135 // NewInvertedIndex creates a new InvertedIndex. 136 // Documents to index must be added in Add. 137 func NewInvertedIndex(cfg Config) *InvertedIndex { 138 idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg} 139 for _, conf := range cfg.Indices { 140 idx.index[conf.Name] = make(map[Keyword][]Document) 141 if conf.Weight < idx.minWeight { 142 // By default, the weight scale starts at 0, but we allow 143 // negative weights. 144 idx.minWeight = conf.Weight 145 } 146 if conf.Weight > idx.maxWeight { 147 idx.maxWeight = conf.Weight 148 } 149 } 150 return idx 151 } 152 153 // Add documents to the inverted index. 154 // The value must support == and !=. 155 func (idx *InvertedIndex) Add(docs ...Document) error { 156 var err error 157 for _, config := range idx.cfg.Indices { 158 if config.Weight == 0 { 159 // Disabled 160 continue 161 } 162 setm := idx.index[config.Name] 163 164 for _, doc := range docs { 165 var words []Keyword 166 words, err = doc.SearchKeywords(config) 167 if err != nil { 168 continue 169 } 170 171 for _, keyword := range words { 172 setm[keyword] = append(setm[keyword], doc) 173 } 174 } 175 } 176 177 return err 178 179 } 180 181 // queryElement holds the index name and keywords that can be used to compose a 182 // search for related content. 183 type queryElement struct { 184 Index string 185 Keywords []Keyword 186 } 187 188 func newQueryElement(index string, keywords ...Keyword) queryElement { 189 return queryElement{Index: index, Keywords: keywords} 190 } 191 192 type ranks []*rank 193 194 type rank struct { 195 Doc Document 196 Weight int 197 Matches int 198 } 199 200 func (r *rank) addWeight(w int) { 201 r.Weight += w 202 r.Matches++ 203 } 204 205 func newRank(doc Document, weight int) *rank { 206 return &rank{Doc: doc, Weight: weight, Matches: 1} 207 } 208 209 func (r ranks) Len() int { return len(r) } 210 func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] } 211 func (r ranks) Less(i, j int) bool { 212 if r[i].Weight == r[j].Weight { 213 return r[i].Doc.PubDate().After(r[j].Doc.PubDate()) 214 } 215 return r[i].Weight > r[j].Weight 216 } 217 218 // SearchDoc finds the documents matching any of the keywords in the given indices 219 // against the given document. 220 // The resulting document set will be sorted according to number of matches 221 // and the index weights, and any matches with a rank below the configured 222 // threshold (normalize to 0..100) will be removed. 223 // If an index name is provided, only that index will be queried. 224 func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) { 225 var q []queryElement 226 227 var configs IndexConfigs 228 229 if len(indices) == 0 { 230 configs = idx.cfg.Indices 231 } else { 232 configs = make(IndexConfigs, len(indices)) 233 for i, indexName := range indices { 234 cfg, found := idx.getIndexCfg(indexName) 235 if !found { 236 return nil, fmt.Errorf("index %q not found", indexName) 237 } 238 configs[i] = cfg 239 } 240 } 241 242 for _, cfg := range configs { 243 keywords, err := doc.SearchKeywords(cfg) 244 if err != nil { 245 return nil, err 246 } 247 248 q = append(q, newQueryElement(cfg.Name, keywords...)) 249 250 } 251 252 return idx.searchDate(doc.PubDate(), q...) 253 } 254 255 func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) { 256 var ( 257 keywords []Keyword 258 toLower = cfg.ToLower 259 ) 260 switch vv := v.(type) { 261 case string: 262 if toLower { 263 vv = strings.ToLower(vv) 264 } 265 keywords = append(keywords, StringKeyword(vv)) 266 case []string: 267 if toLower { 268 for i := 0; i < len(vv); i++ { 269 vv[i] = strings.ToLower(vv[i]) 270 } 271 } 272 keywords = append(keywords, StringsToKeywords(vv...)...) 273 case time.Time: 274 layout := "2006" 275 if cfg.Pattern != "" { 276 layout = cfg.Pattern 277 } 278 keywords = append(keywords, StringKeyword(vv.Format(layout))) 279 case nil: 280 return keywords, nil 281 default: 282 return keywords, fmt.Errorf("indexing currently not supported for for index %q and type %T", cfg.Name, vv) 283 } 284 285 return keywords, nil 286 } 287 288 // SearchKeyValues finds the documents matching any of the keywords in the given indices. 289 // The resulting document set will be sorted according to number of matches 290 // and the index weights, and any matches with a rank below the configured 291 // threshold (normalize to 0..100) will be removed. 292 func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) { 293 q := make([]queryElement, len(args)) 294 295 for i, arg := range args { 296 var keywords []Keyword 297 key := arg.KeyString() 298 if key == "" { 299 return nil, fmt.Errorf("index %q not valid", arg.Key) 300 } 301 conf, found := idx.getIndexCfg(key) 302 if !found { 303 return nil, fmt.Errorf("index %q not found", key) 304 } 305 306 for _, val := range arg.Values { 307 k, err := conf.ToKeywords(val) 308 if err != nil { 309 return nil, err 310 } 311 keywords = append(keywords, k...) 312 } 313 314 q[i] = newQueryElement(conf.Name, keywords...) 315 316 } 317 318 return idx.search(q...) 319 } 320 321 func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) { 322 return idx.searchDate(zeroDate, query...) 323 } 324 325 func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) { 326 matchm := make(map[Document]*rank, 200) 327 applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero() 328 329 for _, el := range query { 330 setm, found := idx.index[el.Index] 331 if !found { 332 return []Document{}, fmt.Errorf("index for %q not found", el.Index) 333 } 334 335 config, found := idx.getIndexCfg(el.Index) 336 if !found { 337 return []Document{}, fmt.Errorf("index config for %q not found", el.Index) 338 } 339 340 for _, kw := range el.Keywords { 341 if docs, found := setm[kw]; found { 342 for _, doc := range docs { 343 if applyDateFilter { 344 // Exclude newer than the limit given 345 if doc.PubDate().After(upperDate) { 346 continue 347 } 348 } 349 r, found := matchm[doc] 350 if !found { 351 matchm[doc] = newRank(doc, config.Weight) 352 } else { 353 r.addWeight(config.Weight) 354 } 355 } 356 } 357 } 358 } 359 360 if len(matchm) == 0 { 361 return []Document{}, nil 362 } 363 364 matches := make(ranks, 0, 100) 365 366 for _, v := range matchm { 367 avgWeight := v.Weight / v.Matches 368 weight := norm(avgWeight, idx.minWeight, idx.maxWeight) 369 threshold := idx.cfg.Threshold / v.Matches 370 371 if weight >= threshold { 372 matches = append(matches, v) 373 } 374 } 375 376 sort.Stable(matches) 377 378 result := make([]Document, len(matches)) 379 380 for i, m := range matches { 381 result[i] = m.Doc 382 } 383 384 return result, nil 385 } 386 387 // normalizes num to a number between 0 and 100. 388 func norm(num, min, max int) int { 389 if min > max { 390 panic("min > max") 391 } 392 return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5)) 393 } 394 395 // DecodeConfig decodes a slice of map into Config. 396 func DecodeConfig(in interface{}) (Config, error) { 397 if in == nil { 398 return Config{}, errors.New("no related config provided") 399 } 400 401 m, ok := in.(map[string]interface{}) 402 if !ok { 403 return Config{}, fmt.Errorf("expected map[string]interface {} got %T", in) 404 } 405 406 if len(m) == 0 { 407 return Config{}, errors.New("empty related config provided") 408 } 409 410 var c Config 411 412 if err := mapstructure.WeakDecode(m, &c); err != nil { 413 return c, err 414 } 415 416 if c.Threshold < 0 || c.Threshold > 100 { 417 return Config{}, errors.New("related threshold must be between 0 and 100") 418 } 419 420 if c.ToLower { 421 for i := range c.Indices { 422 c.Indices[i].ToLower = true 423 } 424 } 425 426 return c, nil 427 } 428 429 // StringKeyword is a string search keyword. 430 type StringKeyword string 431 432 func (s StringKeyword) String() string { 433 return string(s) 434 } 435 436 // Keyword is the interface a keyword in the search index must implement. 437 type Keyword interface { 438 String() string 439 } 440 441 // StringsToKeywords converts the given slice of strings to a slice of Keyword. 442 func StringsToKeywords(s ...string) []Keyword { 443 kw := make([]Keyword, len(s)) 444 445 for i := 0; i < len(s); i++ { 446 kw[i] = StringKeyword(s[i]) 447 } 448 449 return kw 450 }