github.com/sajari/fuzzy@v1.0.0/fuzzy.go (about) 1 package fuzzy 2 3 import ( 4 "bufio" 5 "encoding/json" 6 "errors" 7 "fmt" 8 "index/suffixarray" 9 "io" 10 "log" 11 "os" 12 "regexp" 13 "sort" 14 "strings" 15 "sync" 16 ) 17 18 const ( 19 SpellDepthDefault = 2 20 SpellThresholdDefault = 5 21 SuffDivergenceThresholdDefault = 100 22 ) 23 24 type Pair struct { 25 str1 string 26 str2 string 27 } 28 29 type Method int 30 31 const ( 32 MethodIsWord Method = 0 33 MethodSuggestMapsToInput = 1 34 MethodInputDeleteMapsToDict = 2 35 MethodInputDeleteMapsToSuggest = 3 36 ) 37 38 type Potential struct { 39 Term string // Potential term string 40 Score int // Score 41 Leven int // Levenstein distance from the suggestion to the input 42 Method Method // How this potential was matched 43 } 44 45 type Counts struct { 46 Corpus int `json:"corpus"` 47 Query int `json:"query"` 48 } 49 50 type Model struct { 51 Data map[string]*Counts `json:"data"` 52 Maxcount int `json:"maxcount"` 53 Suggest map[string][]string `json:"suggest"` 54 Depth int `json:"depth"` 55 Threshold int `json:"threshold"` 56 UseAutocomplete bool `json:"autocomplete"` 57 SuffDivergence int `json:"-"` 58 SuffDivergenceThreshold int `json:"suff_threshold"` 59 SuffixArr *suffixarray.Index `json:"-"` 60 SuffixArrConcat string `json:"-"` 61 sync.RWMutex 62 } 63 64 // For sorting autocomplete suggestions 65 // to bias the most popular first 66 type Autos struct { 67 Results []string 68 Model *Model 69 } 70 71 func (a Autos) Len() int { return len(a.Results) } 72 func (a Autos) Swap(i, j int) { a.Results[i], a.Results[j] = a.Results[j], a.Results[i] } 73 74 func (a Autos) Less(i, j int) bool { 75 icc := a.Model.Data[a.Results[i]].Corpus 76 jcc := a.Model.Data[a.Results[j]].Corpus 77 icq := a.Model.Data[a.Results[i]].Query 78 jcq := a.Model.Data[a.Results[j]].Query 79 if icq == jcq { 80 if icc == jcc { 81 return a.Results[i] > a.Results[j] 82 } 83 return icc > jcc 84 } 85 return icq > jcq 86 } 87 88 func (m Method) String() string { 89 switch m { 90 case MethodIsWord: 91 return "Input in dictionary" 92 case MethodSuggestMapsToInput: 93 return "Suggest maps to input" 94 case MethodInputDeleteMapsToDict: 95 return "Input delete maps to dictionary" 96 case MethodInputDeleteMapsToSuggest: 97 return "Input delete maps to suggest key" 98 } 99 return "unknown" 100 } 101 102 func (pot *Potential) String() string { 103 return fmt.Sprintf("Term: %v\n\tScore: %v\n\tLeven: %v\n\tMethod: %v\n\n", pot.Term, pot.Score, pot.Leven, pot.Method) 104 } 105 106 // Create and initialise a new model 107 func NewModel() *Model { 108 model := new(Model) 109 return model.Init() 110 } 111 112 func (model *Model) Init() *Model { 113 model.Data = make(map[string]*Counts) 114 model.Suggest = make(map[string][]string) 115 model.Depth = SpellDepthDefault 116 model.Threshold = SpellThresholdDefault // Setting this to 1 is most accurate, but "1" is 5x more memory and 30x slower processing than "4". This is a big performance tuning knob 117 model.UseAutocomplete = true // Default is to include Autocomplete 118 model.updateSuffixArr() 119 model.SuffDivergenceThreshold = SuffDivergenceThresholdDefault 120 return model 121 } 122 123 // WriteTo writes a model to a Writer 124 func (model *Model) WriteTo(w io.Writer) (int64, error) { 125 model.RLock() 126 defer model.RUnlock() 127 b, err := json.Marshal(model) 128 if err != nil { 129 return 0, err 130 } 131 n, err := w.Write(b) 132 if err != nil { 133 return int64(n), err 134 } 135 return int64(n), nil 136 } 137 138 // Save a spelling model to disk 139 func (model *Model) Save(filename string) error { 140 f, err := os.Create(filename) 141 if err != nil { 142 log.Println("Fuzzy model:", err) 143 return err 144 } 145 defer f.Close() 146 _, err = model.WriteTo(f) 147 if err != nil { 148 log.Println("Fuzzy model:", err) 149 return err 150 } 151 return nil 152 } 153 154 // Save a spelling model to disk, but discard all 155 // entries less than the threshold number of occurences 156 // Much smaller and all that is used when generated 157 // as a once off, but not useful for incremental usage 158 func (model *Model) SaveLight(filename string) error { 159 model.Lock() 160 for term, count := range model.Data { 161 if count.Corpus < model.Threshold { 162 delete(model.Data, term) 163 } 164 } 165 model.Unlock() 166 return model.Save(filename) 167 } 168 169 // FromReader loads a model from a Reader 170 func FromReader(r io.Reader) (*Model, error) { 171 model := new(Model) 172 d := json.NewDecoder(r) 173 err := d.Decode(model) 174 if err != nil { 175 return nil, err 176 } 177 model.updateSuffixArr() 178 return model, nil 179 } 180 181 // Load a saved model from disk 182 func Load(filename string) (*Model, error) { 183 f, err := os.Open(filename) 184 if err != nil { 185 return nil, err 186 } 187 defer f.Close() 188 model, err := FromReader(f) 189 if err != nil { 190 model = new(Model) 191 if err1 := model.convertOldFormat(filename); err1 != nil { 192 return model, err1 193 } 194 return model, nil 195 } 196 return model, nil 197 } 198 199 // Change the default depth value of the model. This sets how many 200 // character differences are indexed. The default is 2. 201 func (model *Model) SetDepth(val int) { 202 model.Lock() 203 model.Depth = val 204 model.Unlock() 205 } 206 207 // Change the default threshold of the model. This is how many times 208 // a term must be seen before suggestions are created for it 209 func (model *Model) SetThreshold(val int) { 210 model.Lock() 211 model.Threshold = val 212 model.Unlock() 213 } 214 215 // Optionally disabled suffixarray based autocomplete support 216 func (model *Model) SetUseAutocomplete(val bool) { 217 model.Lock() 218 old := model.UseAutocomplete 219 model.Unlock() 220 model.UseAutocomplete = val 221 if !old && val { 222 model.updateSuffixArr() 223 } 224 } 225 226 // Optionally set the suffix array divergence threshold. This is 227 // the number of query training steps between rebuilds of the 228 // suffix array. A low number will be more accurate but will use 229 // resources and create more garbage. 230 func (model *Model) SetDivergenceThreshold(val int) { 231 model.Lock() 232 model.SuffDivergenceThreshold = val 233 model.Unlock() 234 } 235 236 // Calculate the Levenshtein distance between two strings 237 func Levenshtein(a, b *string) int { 238 la := len(*a) 239 lb := len(*b) 240 d := make([]int, la+1) 241 var lastdiag, olddiag, temp int 242 243 for i := 1; i <= la; i++ { 244 d[i] = i 245 } 246 for i := 1; i <= lb; i++ { 247 d[0] = i 248 lastdiag = i - 1 249 for j := 1; j <= la; j++ { 250 olddiag = d[j] 251 min := d[j] + 1 252 if (d[j-1] + 1) < min { 253 min = d[j-1] + 1 254 } 255 if (*a)[j-1] == (*b)[i-1] { 256 temp = 0 257 } else { 258 temp = 1 259 } 260 if (lastdiag + temp) < min { 261 min = lastdiag + temp 262 } 263 d[j] = min 264 lastdiag = olddiag 265 } 266 } 267 return d[la] 268 } 269 270 // Add an array of words to train the model in bulk 271 func (model *Model) Train(terms []string) { 272 for _, term := range terms { 273 model.TrainWord(term) 274 } 275 model.updateSuffixArr() 276 } 277 278 // Manually set the count of a word. Optionally trigger the 279 // creation of suggestion keys for the term. This function lets 280 // you build a model from an existing dictionary with word popularity 281 // counts without needing to run "TrainWord" repeatedly 282 func (model *Model) SetCount(term string, count int, suggest bool) { 283 model.Lock() 284 model.Data[term] = &Counts{count, 0} // Note: This may reset a query count? TODO 285 if suggest { 286 model.createSuggestKeys(term) 287 } 288 model.Unlock() 289 } 290 291 // Train the model word by word. This is corpus training as opposed 292 // to query training. Word counts from this type of training are not 293 // likely to correlate with those of search queries 294 func (model *Model) TrainWord(term string) { 295 model.Lock() 296 if t, ok := model.Data[term]; ok { 297 t.Corpus++ 298 } else { 299 model.Data[term] = &Counts{1, 0} 300 } 301 // Set the max 302 if model.Data[term].Corpus > model.Maxcount { 303 model.Maxcount = model.Data[term].Corpus 304 model.SuffDivergence++ 305 } 306 // If threshold is triggered, store delete suggestion keys 307 if model.Data[term].Corpus == model.Threshold { 308 model.createSuggestKeys(term) 309 } 310 model.Unlock() 311 } 312 313 // Train using a search query term. This builds a second popularity 314 // index of terms used to search, as opposed to generally occurring 315 // in corpus text 316 func (model *Model) TrainQuery(term string) { 317 model.Lock() 318 if t, ok := model.Data[term]; ok { 319 t.Query++ 320 } else { 321 model.Data[term] = &Counts{0, 1} 322 } 323 model.SuffDivergence++ 324 update := model.SuffDivergence > model.SuffDivergenceThreshold 325 model.Unlock() 326 if update { 327 model.updateSuffixArr() 328 } 329 } 330 331 // For a given term, create the partially deleted lookup keys 332 func (model *Model) createSuggestKeys(term string) { 333 edits := model.EditsMulti(term, model.Depth) 334 for _, edit := range edits { 335 skip := false 336 for _, hit := range model.Suggest[edit] { 337 if hit == term { 338 // Already know about this one 339 skip = true 340 continue 341 } 342 } 343 if !skip && len(edit) > 1 { 344 model.Suggest[edit] = append(model.Suggest[edit], term) 345 } 346 } 347 } 348 349 // Edits at any depth for a given term. The depth of the model is used 350 func (model *Model) EditsMulti(term string, depth int) []string { 351 edits := Edits1(term) 352 for { 353 depth-- 354 if depth <= 0 { 355 break 356 } 357 for _, edit := range edits { 358 edits2 := Edits1(edit) 359 for _, edit2 := range edits2 { 360 edits = append(edits, edit2) 361 } 362 } 363 } 364 return edits 365 } 366 367 // Edits1 creates a set of terms that are 1 char delete from the input term 368 func Edits1(word string) []string { 369 370 splits := []Pair{} 371 for i := 0; i <= len(word); i++ { 372 splits = append(splits, Pair{word[:i], word[i:]}) 373 } 374 375 total_set := []string{} 376 for _, elem := range splits { 377 378 //deletion 379 if len(elem.str2) > 0 { 380 total_set = append(total_set, elem.str1+elem.str2[1:]) 381 } else { 382 total_set = append(total_set, elem.str1) 383 } 384 385 } 386 387 // Special case ending in "ies" or "ys" 388 if strings.HasSuffix(word, "ies") { 389 total_set = append(total_set, word[:len(word)-3]+"ys") 390 } 391 if strings.HasSuffix(word, "ys") { 392 total_set = append(total_set, word[:len(word)-2]+"ies") 393 } 394 395 return total_set 396 } 397 398 func (model *Model) corpusCount(input string) int { 399 if score, ok := model.Data[input]; ok { 400 return score.Corpus 401 } 402 return 0 403 } 404 405 // From a group of potentials, work out the most likely result 406 func best(input string, potential map[string]*Potential) string { 407 var best string 408 var bestcalc, bonus int 409 for i := 0; i < 4; i++ { 410 for _, pot := range potential { 411 if pot.Leven == 0 { 412 return pot.Term 413 } else if pot.Leven == i { 414 bonus = 0 415 // If the first letter is the same, that's a good sign. Bias these potentials 416 if pot.Term[0] == input[0] { 417 bonus += 100 418 } 419 if pot.Score+bonus > bestcalc { 420 bestcalc = pot.Score + bonus 421 best = pot.Term 422 } 423 } 424 } 425 if bestcalc > 0 { 426 return best 427 } 428 } 429 return best 430 } 431 432 // From a group of potentials, work out the most likely results, in order of 433 // best to worst 434 func bestn(input string, potential map[string]*Potential, n int) []string { 435 var output []string 436 for i := 0; i < n; i++ { 437 if len(potential) == 0 { 438 break 439 } 440 b := best(input, potential) 441 output = append(output, b) 442 delete(potential, b) 443 } 444 return output 445 } 446 447 // Test an input, if we get it wrong, look at why it is wrong. This 448 // function returns a bool indicating if the guess was correct as well 449 // as the term it is suggesting. Typically this function would be used 450 // for testing, not for production 451 func (model *Model) CheckKnown(input string, correct string) bool { 452 model.RLock() 453 defer model.RUnlock() 454 suggestions := model.suggestPotential(input, true) 455 best := best(input, suggestions) 456 if best == correct { 457 // This guess is correct 458 fmt.Printf("Input correctly maps to correct term") 459 return true 460 } 461 if pot, ok := suggestions[correct]; !ok { 462 463 if model.corpusCount(correct) > 0 { 464 fmt.Printf("\"%v\" - %v (%v) not in the suggestions. (%v) best option.\n", input, correct, model.corpusCount(correct), best) 465 for _, sugg := range suggestions { 466 fmt.Printf(" %v\n", sugg) 467 } 468 } else { 469 fmt.Printf("\"%v\" - Not in dictionary\n", correct) 470 } 471 } else { 472 fmt.Printf("\"%v\" - (%v) suggested, should however be (%v).\n", input, suggestions[best], pot) 473 } 474 return false 475 } 476 477 // For a given input term, suggest some alternatives. If exhaustive, each of the 4 478 // cascading checks will be performed and all potentials will be sorted accordingly 479 func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*Potential { 480 input = strings.ToLower(input) 481 suggestions := make(map[string]*Potential, 20) 482 483 // 0 - If this is a dictionary term we're all good, no need to go further 484 if model.corpusCount(input) > model.Threshold { 485 suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord} 486 if !exhaustive { 487 return suggestions 488 } 489 } 490 491 // 1 - See if the input matches a "suggest" key 492 if sugg, ok := model.Suggest[input]; ok { 493 for _, pot := range sugg { 494 if _, ok := suggestions[pot]; !ok { 495 suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput} 496 } 497 } 498 499 if !exhaustive { 500 return suggestions 501 } 502 } 503 504 // 2 - See if edit1 matches input 505 max := 0 506 edits := model.EditsMulti(input, model.Depth) 507 for _, edit := range edits { 508 score := model.corpusCount(edit) 509 if score > 0 && len(edit) > 2 { 510 if _, ok := suggestions[edit]; !ok { 511 suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict} 512 } 513 if score > max { 514 max = score 515 } 516 } 517 } 518 if max > 0 { 519 if !exhaustive { 520 return suggestions 521 } 522 } 523 524 // 3 - No hits on edit1 distance, look for transposes and replaces 525 // Note: these are more complex, we need to check the guesses 526 // more thoroughly, e.g. levals=[valves] in a raw sense, which 527 // is incorrect 528 for _, edit := range edits { 529 if sugg, ok := model.Suggest[edit]; ok { 530 // Is this a real transpose or replace? 531 for _, pot := range sugg { 532 lev := Levenshtein(&input, &pot) 533 if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions 534 if _, ok := suggestions[pot]; !ok { 535 suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest} 536 } 537 } 538 } 539 } 540 } 541 return suggestions 542 } 543 544 // Return the raw potential terms so they can be ranked externally 545 // to this package 546 func (model *Model) Potentials(input string, exhaustive bool) map[string]*Potential { 547 model.RLock() 548 defer model.RUnlock() 549 return model.suggestPotential(input, exhaustive) 550 } 551 552 // For a given input string, suggests potential replacements 553 func (model *Model) Suggestions(input string, exhaustive bool) []string { 554 model.RLock() 555 suggestions := model.suggestPotential(input, exhaustive) 556 model.RUnlock() 557 output := make([]string, 0, 10) 558 for _, suggestion := range suggestions { 559 output = append(output, suggestion.Term) 560 } 561 return output 562 } 563 564 // Return the most likely correction for the input term 565 func (model *Model) SpellCheck(input string) string { 566 model.RLock() 567 suggestions := model.suggestPotential(input, false) 568 model.RUnlock() 569 return best(input, suggestions) 570 } 571 572 // Return the most likely corrections in order from best to worst 573 func (model *Model) SpellCheckSuggestions(input string, n int) []string { 574 model.RLock() 575 suggestions := model.suggestPotential(input, true) 576 model.RUnlock() 577 return bestn(input, suggestions, n) 578 } 579 580 func SampleEnglish() []string { 581 var out []string 582 file, err := os.Open("data/big.txt") 583 if err != nil { 584 fmt.Println(err) 585 return out 586 } 587 reader := bufio.NewReader(file) 588 scanner := bufio.NewScanner(reader) 589 scanner.Split(bufio.ScanLines) 590 // Count the words. 591 count := 0 592 for scanner.Scan() { 593 exp, _ := regexp.Compile("[a-zA-Z]+") 594 words := exp.FindAll([]byte(scanner.Text()), -1) 595 for _, word := range words { 596 if len(word) > 1 { 597 out = append(out, strings.ToLower(string(word))) 598 count++ 599 } 600 } 601 } 602 if err := scanner.Err(); err != nil { 603 fmt.Fprintln(os.Stderr, "reading input:", err) 604 } 605 606 return out 607 } 608 609 // Takes the known dictionary listing and creates a suffix array 610 // model for these terms. If a model already existed, it is discarded 611 func (model *Model) updateSuffixArr() { 612 if !model.UseAutocomplete { 613 return 614 } 615 model.RLock() 616 termArr := make([]string, 0, 1000) 617 for term, count := range model.Data { 618 if count.Corpus > model.Threshold || count.Query > 0 { // TODO: query threshold? 619 termArr = append(termArr, term) 620 } 621 } 622 model.SuffixArrConcat = "\x00" + strings.Join(termArr, "\x00") + "\x00" 623 model.SuffixArr = suffixarray.New([]byte(model.SuffixArrConcat)) 624 model.SuffDivergence = 0 625 model.RUnlock() 626 } 627 628 // For a given string, autocomplete using the suffix array model 629 func (model *Model) Autocomplete(input string) ([]string, error) { 630 model.RLock() 631 defer model.RUnlock() 632 if !model.UseAutocomplete { 633 return []string{}, errors.New("Autocomplete is disabled") 634 } 635 if len(input) == 0 { 636 return []string{}, errors.New("Input cannot have length zero") 637 } 638 express := "\x00" + input + "[^\x00]*" 639 match, err := regexp.Compile(express) 640 if err != nil { 641 return []string{}, err 642 } 643 matches := model.SuffixArr.FindAllIndex(match, -1) 644 a := &Autos{Results: make([]string, 0, len(matches)), Model: model} 645 for _, m := range matches { 646 str := strings.Trim(model.SuffixArrConcat[m[0]:m[1]], "\x00") 647 if count, ok := model.Data[str]; ok { 648 if count.Corpus > model.Threshold || count.Query > 0 { 649 a.Results = append(a.Results, str) 650 } 651 } 652 } 653 sort.Sort(a) 654 if len(a.Results) >= 10 { 655 return a.Results[:10], nil 656 } 657 return a.Results, nil 658 }