github.com/sajari/fuzzy@v1.0.0/fuzzy.go (about)

     1  package fuzzy
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  	"index/suffixarray"
     9  	"io"
    10  	"log"
    11  	"os"
    12  	"regexp"
    13  	"sort"
    14  	"strings"
    15  	"sync"
    16  )
    17  
// Default tuning values that Model.Init copies into a new model.
const (
	// SpellDepthDefault is the default Model.Depth, the edit depth handed to
	// EditsMulti when delete keys are generated.
	SpellDepthDefault              = 2
	// SpellThresholdDefault is the default Model.Threshold, the corpus count
	// at which TrainWord creates suggestion keys for a term.
	SpellThresholdDefault          = 5
	// SuffDivergenceThresholdDefault is the default
	// Model.SuffDivergenceThreshold; TrainQuery rebuilds the suffix array once
	// Model.SuffDivergence exceeds it.
	SuffDivergenceThresholdDefault = 100
)
    23  
// Pair holds the two halves of a string split at one index. Edits1 builds one
// Pair per split position, with str1 = word[:i] and str2 = word[i:].
type Pair struct {
	str1 string
	str2 string
}
    28  
    29  type Method int
    30  
    31  const (
    32  	MethodIsWord                   Method = 0
    33  	MethodSuggestMapsToInput              = 1
    34  	MethodInputDeleteMapsToDict           = 2
    35  	MethodInputDeleteMapsToSuggest        = 3
    36  )
    37  
// Potential is one candidate correction produced by suggestPotential.
type Potential struct {
	Term   string // Candidate term
	Score  int    // Corpus count of the term at the time it was suggested
	Leven  int    // Levenshtein distance between the (lower-cased) input and Term
	Method Method // Lookup stage that produced this candidate
}
    44  
// Counts holds the two popularity counters kept per term: Corpus is
// incremented by TrainWord (or set by SetCount) and Query is incremented by
// TrainQuery.
type Counts struct {
	Corpus int `json:"corpus"`
	Query  int `json:"query"`
}
    49  
// Model is the trainable spelling/autocomplete model. The embedded RWMutex
// is the lock the model's methods take around reads and writes of its fields.
// Fields tagged `json:"-"` are not serialized; FromReader rebuilds the suffix
// array after decoding.
type Model struct {
	// Data maps each term to its corpus and query counts.
	Data                    map[string]*Counts  `json:"data"`
	// Maxcount is the highest corpus count TrainWord has seen so far.
	Maxcount                int                 `json:"maxcount"`
	// Suggest maps a delete-edit key to the terms that produce that key.
	Suggest                 map[string][]string `json:"suggest"`
	// Depth is the edit depth passed to EditsMulti.
	Depth                   int                 `json:"depth"`
	// Threshold is the corpus count at which TrainWord creates suggest keys.
	Threshold               int                 `json:"threshold"`
	// UseAutocomplete enables the suffix array used by Autocomplete.
	UseAutocomplete         bool                `json:"autocomplete"`
	// SuffDivergence counts training steps since the last suffix array
	// rebuild; updateSuffixArr resets it to zero.
	SuffDivergence          int                 `json:"-"`
	// SuffDivergenceThreshold is the SuffDivergence value above which
	// TrainQuery triggers a rebuild.
	SuffDivergenceThreshold int                 `json:"suff_threshold"`
	// SuffixArr indexes the bytes of SuffixArrConcat.
	SuffixArr               *suffixarray.Index  `json:"-"`
	// SuffixArrConcat is the "\x00"-delimited concatenation of indexed terms.
	SuffixArrConcat         string              `json:"-"`
	sync.RWMutex
}
    63  
// Autos adapts a list of autocomplete results to sort.Interface so that the
// most popular terms sort first. Less looks every result up in Model.Data,
// so each entry of Results must be a key of that map.
type Autos struct {
	Results []string // Terms to be ordered in place
	Model   *Model   // Model whose counts drive the ordering
}
    70  
    71  func (a Autos) Len() int      { return len(a.Results) }
    72  func (a Autos) Swap(i, j int) { a.Results[i], a.Results[j] = a.Results[j], a.Results[i] }
    73  
    74  func (a Autos) Less(i, j int) bool {
    75  	icc := a.Model.Data[a.Results[i]].Corpus
    76  	jcc := a.Model.Data[a.Results[j]].Corpus
    77  	icq := a.Model.Data[a.Results[i]].Query
    78  	jcq := a.Model.Data[a.Results[j]].Query
    79  	if icq == jcq {
    80  		if icc == jcc {
    81  			return a.Results[i] > a.Results[j]
    82  		}
    83  		return icc > jcc
    84  	}
    85  	return icq > jcq
    86  }
    87  
    88  func (m Method) String() string {
    89  	switch m {
    90  	case MethodIsWord:
    91  		return "Input in dictionary"
    92  	case MethodSuggestMapsToInput:
    93  		return "Suggest maps to input"
    94  	case MethodInputDeleteMapsToDict:
    95  		return "Input delete maps to dictionary"
    96  	case MethodInputDeleteMapsToSuggest:
    97  		return "Input delete maps to suggest key"
    98  	}
    99  	return "unknown"
   100  }
   101  
   102  func (pot *Potential) String() string {
   103  	return fmt.Sprintf("Term: %v\n\tScore: %v\n\tLeven: %v\n\tMethod: %v\n\n", pot.Term, pot.Score, pot.Leven, pot.Method)
   104  }
   105  
   106  // Create and initialise a new model
   107  func NewModel() *Model {
   108  	model := new(Model)
   109  	return model.Init()
   110  }
   111  
   112  func (model *Model) Init() *Model {
   113  	model.Data = make(map[string]*Counts)
   114  	model.Suggest = make(map[string][]string)
   115  	model.Depth = SpellDepthDefault
   116  	model.Threshold = SpellThresholdDefault // Setting this to 1 is most accurate, but "1" is 5x more memory and 30x slower processing than "4". This is a big performance tuning knob
   117  	model.UseAutocomplete = true            // Default is to include Autocomplete
   118  	model.updateSuffixArr()
   119  	model.SuffDivergenceThreshold = SuffDivergenceThresholdDefault
   120  	return model
   121  }
   122  
   123  // WriteTo writes a model to a Writer
   124  func (model *Model) WriteTo(w io.Writer) (int64, error) {
   125  	model.RLock()
   126  	defer model.RUnlock()
   127  	b, err := json.Marshal(model)
   128  	if err != nil {
   129  		return 0, err
   130  	}
   131  	n, err := w.Write(b)
   132  	if err != nil {
   133  		return int64(n), err
   134  	}
   135  	return int64(n), nil
   136  }
   137  
   138  // Save a spelling model to disk
   139  func (model *Model) Save(filename string) error {
   140  	f, err := os.Create(filename)
   141  	if err != nil {
   142  		log.Println("Fuzzy model:", err)
   143  		return err
   144  	}
   145  	defer f.Close()
   146  	_, err = model.WriteTo(f)
   147  	if err != nil {
   148  		log.Println("Fuzzy model:", err)
   149  		return err
   150  	}
   151  	return nil
   152  }
   153  
   154  // Save a spelling model to disk, but discard all
   155  // entries less than the threshold number of occurences
   156  // Much smaller and all that is used when generated
   157  // as a once off, but not useful for incremental usage
   158  func (model *Model) SaveLight(filename string) error {
   159  	model.Lock()
   160  	for term, count := range model.Data {
   161  		if count.Corpus < model.Threshold {
   162  			delete(model.Data, term)
   163  		}
   164  	}
   165  	model.Unlock()
   166  	return model.Save(filename)
   167  }
   168  
   169  // FromReader loads a model from a Reader
   170  func FromReader(r io.Reader) (*Model, error) {
   171  	model := new(Model)
   172  	d := json.NewDecoder(r)
   173  	err := d.Decode(model)
   174  	if err != nil {
   175  		return nil, err
   176  	}
   177  	model.updateSuffixArr()
   178  	return model, nil
   179  }
   180  
   181  // Load a saved model from disk
   182  func Load(filename string) (*Model, error) {
   183  	f, err := os.Open(filename)
   184  	if err != nil {
   185  		return nil, err
   186  	}
   187  	defer f.Close()
   188  	model, err := FromReader(f)
   189  	if err != nil {
   190  		model = new(Model)
   191  		if err1 := model.convertOldFormat(filename); err1 != nil {
   192  			return model, err1
   193  		}
   194  		return model, nil
   195  	}
   196  	return model, nil
   197  }
   198  
   199  // Change the default depth value of the model. This sets how many
   200  // character differences are indexed. The default is 2.
   201  func (model *Model) SetDepth(val int) {
   202  	model.Lock()
   203  	model.Depth = val
   204  	model.Unlock()
   205  }
   206  
   207  // Change the default threshold of the model. This is how many times
   208  // a term must be seen before suggestions are created for it
   209  func (model *Model) SetThreshold(val int) {
   210  	model.Lock()
   211  	model.Threshold = val
   212  	model.Unlock()
   213  }
   214  
   215  // Optionally disabled suffixarray based autocomplete support
   216  func (model *Model) SetUseAutocomplete(val bool) {
   217  	model.Lock()
   218  	old := model.UseAutocomplete
   219  	model.Unlock()
   220  	model.UseAutocomplete = val
   221  	if !old && val {
   222  		model.updateSuffixArr()
   223  	}
   224  }
   225  
   226  // Optionally set the suffix array divergence threshold. This is
   227  // the number of query training steps between rebuilds of the
   228  // suffix array. A low number will be more accurate but will use
   229  // resources and create more garbage.
   230  func (model *Model) SetDivergenceThreshold(val int) {
   231  	model.Lock()
   232  	model.SuffDivergenceThreshold = val
   233  	model.Unlock()
   234  }
   235  
   236  // Calculate the Levenshtein distance between two strings
   237  func Levenshtein(a, b *string) int {
   238  	la := len(*a)
   239  	lb := len(*b)
   240  	d := make([]int, la+1)
   241  	var lastdiag, olddiag, temp int
   242  
   243  	for i := 1; i <= la; i++ {
   244  		d[i] = i
   245  	}
   246  	for i := 1; i <= lb; i++ {
   247  		d[0] = i
   248  		lastdiag = i - 1
   249  		for j := 1; j <= la; j++ {
   250  			olddiag = d[j]
   251  			min := d[j] + 1
   252  			if (d[j-1] + 1) < min {
   253  				min = d[j-1] + 1
   254  			}
   255  			if (*a)[j-1] == (*b)[i-1] {
   256  				temp = 0
   257  			} else {
   258  				temp = 1
   259  			}
   260  			if (lastdiag + temp) < min {
   261  				min = lastdiag + temp
   262  			}
   263  			d[j] = min
   264  			lastdiag = olddiag
   265  		}
   266  	}
   267  	return d[la]
   268  }
   269  
   270  // Add an array of words to train the model in bulk
   271  func (model *Model) Train(terms []string) {
   272  	for _, term := range terms {
   273  		model.TrainWord(term)
   274  	}
   275  	model.updateSuffixArr()
   276  }
   277  
   278  // Manually set the count of a word. Optionally trigger the
   279  // creation of suggestion keys for the term. This function lets
   280  // you build a model from an existing dictionary with word popularity
   281  // counts without needing to run "TrainWord" repeatedly
   282  func (model *Model) SetCount(term string, count int, suggest bool) {
   283  	model.Lock()
   284  	model.Data[term] = &Counts{count, 0} // Note: This may reset a query count? TODO
   285  	if suggest {
   286  		model.createSuggestKeys(term)
   287  	}
   288  	model.Unlock()
   289  }
   290  
   291  // Train the model word by word. This is corpus training as opposed
   292  // to query training. Word counts from this type of training are not
   293  // likely to correlate with those of search queries
   294  func (model *Model) TrainWord(term string) {
   295  	model.Lock()
   296  	if t, ok := model.Data[term]; ok {
   297  		t.Corpus++
   298  	} else {
   299  		model.Data[term] = &Counts{1, 0}
   300  	}
   301  	// Set the max
   302  	if model.Data[term].Corpus > model.Maxcount {
   303  		model.Maxcount = model.Data[term].Corpus
   304  		model.SuffDivergence++
   305  	}
   306  	// If threshold is triggered, store delete suggestion keys
   307  	if model.Data[term].Corpus == model.Threshold {
   308  		model.createSuggestKeys(term)
   309  	}
   310  	model.Unlock()
   311  }
   312  
   313  // Train using a search query term. This builds a second popularity
   314  // index of terms used to search, as opposed to generally occurring
   315  // in corpus text
   316  func (model *Model) TrainQuery(term string) {
   317  	model.Lock()
   318  	if t, ok := model.Data[term]; ok {
   319  		t.Query++
   320  	} else {
   321  		model.Data[term] = &Counts{0, 1}
   322  	}
   323  	model.SuffDivergence++
   324  	update := model.SuffDivergence > model.SuffDivergenceThreshold
   325  	model.Unlock()
   326  	if update {
   327  		model.updateSuffixArr()
   328  	}
   329  }
   330  
   331  // For a given term, create the partially deleted lookup keys
   332  func (model *Model) createSuggestKeys(term string) {
   333  	edits := model.EditsMulti(term, model.Depth)
   334  	for _, edit := range edits {
   335  		skip := false
   336  		for _, hit := range model.Suggest[edit] {
   337  			if hit == term {
   338  				// Already know about this one
   339  				skip = true
   340  				continue
   341  			}
   342  		}
   343  		if !skip && len(edit) > 1 {
   344  			model.Suggest[edit] = append(model.Suggest[edit], term)
   345  		}
   346  	}
   347  }
   348  
   349  // Edits at any depth for a given term. The depth of the model is used
   350  func (model *Model) EditsMulti(term string, depth int) []string {
   351  	edits := Edits1(term)
   352  	for {
   353  		depth--
   354  		if depth <= 0 {
   355  			break
   356  		}
   357  		for _, edit := range edits {
   358  			edits2 := Edits1(edit)
   359  			for _, edit2 := range edits2 {
   360  				edits = append(edits, edit2)
   361  			}
   362  		}
   363  	}
   364  	return edits
   365  }
   366  
   367  // Edits1 creates a set of terms that are 1 char delete from the input term
   368  func Edits1(word string) []string {
   369  
   370  	splits := []Pair{}
   371  	for i := 0; i <= len(word); i++ {
   372  		splits = append(splits, Pair{word[:i], word[i:]})
   373  	}
   374  
   375  	total_set := []string{}
   376  	for _, elem := range splits {
   377  
   378  		//deletion
   379  		if len(elem.str2) > 0 {
   380  			total_set = append(total_set, elem.str1+elem.str2[1:])
   381  		} else {
   382  			total_set = append(total_set, elem.str1)
   383  		}
   384  
   385  	}
   386  
   387  	// Special case ending in "ies" or "ys"
   388  	if strings.HasSuffix(word, "ies") {
   389  		total_set = append(total_set, word[:len(word)-3]+"ys")
   390  	}
   391  	if strings.HasSuffix(word, "ys") {
   392  		total_set = append(total_set, word[:len(word)-2]+"ies")
   393  	}
   394  
   395  	return total_set
   396  }
   397  
   398  func (model *Model) corpusCount(input string) int {
   399  	if score, ok := model.Data[input]; ok {
   400  		return score.Corpus
   401  	}
   402  	return 0
   403  }
   404  
   405  // From a group of potentials, work out the most likely result
   406  func best(input string, potential map[string]*Potential) string {
   407  	var best string
   408  	var bestcalc, bonus int
   409  	for i := 0; i < 4; i++ {
   410  		for _, pot := range potential {
   411  			if pot.Leven == 0 {
   412  				return pot.Term
   413  			} else if pot.Leven == i {
   414  				bonus = 0
   415  				// If the first letter is the same, that's a good sign. Bias these potentials
   416  				if pot.Term[0] == input[0] {
   417  					bonus += 100
   418  				}
   419  				if pot.Score+bonus > bestcalc {
   420  					bestcalc = pot.Score + bonus
   421  					best = pot.Term
   422  				}
   423  			}
   424  		}
   425  		if bestcalc > 0 {
   426  			return best
   427  		}
   428  	}
   429  	return best
   430  }
   431  
   432  // From a group of potentials, work out the most likely results, in order of
   433  // best to worst
   434  func bestn(input string, potential map[string]*Potential, n int) []string {
   435  	var output []string
   436  	for i := 0; i < n; i++ {
   437  		if len(potential) == 0 {
   438  			break
   439  		}
   440  		b := best(input, potential)
   441  		output = append(output, b)
   442  		delete(potential, b)
   443  	}
   444  	return output
   445  }
   446  
   447  // Test an input, if we get it wrong, look at why it is wrong. This
   448  // function returns a bool indicating if the guess was correct as well
   449  // as the term it is suggesting. Typically this function would be used
   450  // for testing, not for production
   451  func (model *Model) CheckKnown(input string, correct string) bool {
   452  	model.RLock()
   453  	defer model.RUnlock()
   454  	suggestions := model.suggestPotential(input, true)
   455  	best := best(input, suggestions)
   456  	if best == correct {
   457  		// This guess is correct
   458  		fmt.Printf("Input correctly maps to correct term")
   459  		return true
   460  	}
   461  	if pot, ok := suggestions[correct]; !ok {
   462  
   463  		if model.corpusCount(correct) > 0 {
   464  			fmt.Printf("\"%v\" - %v (%v) not in the suggestions. (%v) best option.\n", input, correct, model.corpusCount(correct), best)
   465  			for _, sugg := range suggestions {
   466  				fmt.Printf("	%v\n", sugg)
   467  			}
   468  		} else {
   469  			fmt.Printf("\"%v\" - Not in dictionary\n", correct)
   470  		}
   471  	} else {
   472  		fmt.Printf("\"%v\" - (%v) suggested, should however be (%v).\n", input, suggestions[best], pot)
   473  	}
   474  	return false
   475  }
   476  
   477  // For a given input term, suggest some alternatives. If exhaustive, each of the 4
   478  // cascading checks will be performed and all potentials will be sorted accordingly
   479  func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*Potential {
   480  	input = strings.ToLower(input)
   481  	suggestions := make(map[string]*Potential, 20)
   482  
   483  	// 0 - If this is a dictionary term we're all good, no need to go further
   484  	if model.corpusCount(input) > model.Threshold {
   485  		suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord}
   486  		if !exhaustive {
   487  			return suggestions
   488  		}
   489  	}
   490  
   491  	// 1 - See if the input matches a "suggest" key
   492  	if sugg, ok := model.Suggest[input]; ok {
   493  		for _, pot := range sugg {
   494  			if _, ok := suggestions[pot]; !ok {
   495  				suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput}
   496  			}
   497  		}
   498  
   499  		if !exhaustive {
   500  			return suggestions
   501  		}
   502  	}
   503  
   504  	// 2 - See if edit1 matches input
   505  	max := 0
   506  	edits := model.EditsMulti(input, model.Depth)
   507  	for _, edit := range edits {
   508  		score := model.corpusCount(edit)
   509  		if score > 0 && len(edit) > 2 {
   510  			if _, ok := suggestions[edit]; !ok {
   511  				suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict}
   512  			}
   513  			if score > max {
   514  				max = score
   515  			}
   516  		}
   517  	}
   518  	if max > 0 {
   519  		if !exhaustive {
   520  			return suggestions
   521  		}
   522  	}
   523  
   524  	// 3 - No hits on edit1 distance, look for transposes and replaces
   525  	// Note: these are more complex, we need to check the guesses
   526  	// more thoroughly, e.g. levals=[valves] in a raw sense, which
   527  	// is incorrect
   528  	for _, edit := range edits {
   529  		if sugg, ok := model.Suggest[edit]; ok {
   530  			// Is this a real transpose or replace?
   531  			for _, pot := range sugg {
   532  				lev := Levenshtein(&input, &pot)
   533  				if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions
   534  					if _, ok := suggestions[pot]; !ok {
   535  						suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest}
   536  					}
   537  				}
   538  			}
   539  		}
   540  	}
   541  	return suggestions
   542  }
   543  
   544  // Return the raw potential terms so they can be ranked externally
   545  // to this package
   546  func (model *Model) Potentials(input string, exhaustive bool) map[string]*Potential {
   547  	model.RLock()
   548  	defer model.RUnlock()
   549  	return model.suggestPotential(input, exhaustive)
   550  }
   551  
   552  // For a given input string, suggests potential replacements
   553  func (model *Model) Suggestions(input string, exhaustive bool) []string {
   554  	model.RLock()
   555  	suggestions := model.suggestPotential(input, exhaustive)
   556  	model.RUnlock()
   557  	output := make([]string, 0, 10)
   558  	for _, suggestion := range suggestions {
   559  		output = append(output, suggestion.Term)
   560  	}
   561  	return output
   562  }
   563  
   564  // Return the most likely correction for the input term
   565  func (model *Model) SpellCheck(input string) string {
   566  	model.RLock()
   567  	suggestions := model.suggestPotential(input, false)
   568  	model.RUnlock()
   569  	return best(input, suggestions)
   570  }
   571  
   572  // Return the most likely corrections in order from best to worst
   573  func (model *Model) SpellCheckSuggestions(input string, n int) []string {
   574  	model.RLock()
   575  	suggestions := model.suggestPotential(input, true)
   576  	model.RUnlock()
   577  	return bestn(input, suggestions, n)
   578  }
   579  
   580  func SampleEnglish() []string {
   581  	var out []string
   582  	file, err := os.Open("data/big.txt")
   583  	if err != nil {
   584  		fmt.Println(err)
   585  		return out
   586  	}
   587  	reader := bufio.NewReader(file)
   588  	scanner := bufio.NewScanner(reader)
   589  	scanner.Split(bufio.ScanLines)
   590  	// Count the words.
   591  	count := 0
   592  	for scanner.Scan() {
   593  		exp, _ := regexp.Compile("[a-zA-Z]+")
   594  		words := exp.FindAll([]byte(scanner.Text()), -1)
   595  		for _, word := range words {
   596  			if len(word) > 1 {
   597  				out = append(out, strings.ToLower(string(word)))
   598  				count++
   599  			}
   600  		}
   601  	}
   602  	if err := scanner.Err(); err != nil {
   603  		fmt.Fprintln(os.Stderr, "reading input:", err)
   604  	}
   605  
   606  	return out
   607  }
   608  
   609  // Takes the known dictionary listing and creates a suffix array
   610  // model for these terms. If a model already existed, it is discarded
   611  func (model *Model) updateSuffixArr() {
   612  	if !model.UseAutocomplete {
   613  		return
   614  	}
   615  	model.RLock()
   616  	termArr := make([]string, 0, 1000)
   617  	for term, count := range model.Data {
   618  		if count.Corpus > model.Threshold || count.Query > 0 { // TODO: query threshold?
   619  			termArr = append(termArr, term)
   620  		}
   621  	}
   622  	model.SuffixArrConcat = "\x00" + strings.Join(termArr, "\x00") + "\x00"
   623  	model.SuffixArr = suffixarray.New([]byte(model.SuffixArrConcat))
   624  	model.SuffDivergence = 0
   625  	model.RUnlock()
   626  }
   627  
   628  // For a given string, autocomplete using the suffix array model
   629  func (model *Model) Autocomplete(input string) ([]string, error) {
   630  	model.RLock()
   631  	defer model.RUnlock()
   632  	if !model.UseAutocomplete {
   633  		return []string{}, errors.New("Autocomplete is disabled")
   634  	}
   635  	if len(input) == 0 {
   636  		return []string{}, errors.New("Input cannot have length zero")
   637  	}
   638  	express := "\x00" + input + "[^\x00]*"
   639  	match, err := regexp.Compile(express)
   640  	if err != nil {
   641  		return []string{}, err
   642  	}
   643  	matches := model.SuffixArr.FindAllIndex(match, -1)
   644  	a := &Autos{Results: make([]string, 0, len(matches)), Model: model}
   645  	for _, m := range matches {
   646  		str := strings.Trim(model.SuffixArrConcat[m[0]:m[1]], "\x00")
   647  		if count, ok := model.Data[str]; ok {
   648  			if count.Corpus > model.Threshold || count.Query > 0 {
   649  				a.Results = append(a.Results, str)
   650  			}
   651  		}
   652  	}
   653  	sort.Sort(a)
   654  	if len(a.Results) >= 10 {
   655  		return a.Results[:10], nil
   656  	}
   657  	return a.Results, nil
   658  }