github.com/lazin/go-ngram@v0.0.0-20160527144230-80eaf16ac4eb/ngram.go (about) 1 package ngram 2 3 import ( 4 "errors" 5 "math" 6 "sync" 7 8 "github.com/spaolacci/murmur3" 9 ) 10 11 const ( 12 maxN = 8 13 defaultPad = "$" 14 defaultN = 3 15 ) 16 17 // TokenID is just id of the token 18 type TokenID int 19 20 type nGramValue map[TokenID]int 21 22 // NGramIndex can be initialized by default (zeroed) or created with "NewNgramIndex" 23 type NGramIndex struct { 24 pad string 25 n int 26 spool stringPool 27 index map[uint32]nGramValue 28 warp float64 29 30 sync.RWMutex 31 } 32 33 // SearchResult contains token id and similarity - value in range from 0.0 to 1.0 34 type SearchResult struct { 35 TokenID TokenID 36 Similarity float64 37 } 38 39 func (ngram *NGramIndex) splitInput(str string) ([]uint32, error) { 40 if len(str) == 0 { 41 return nil, errors.New("empty string") 42 } 43 pad := ngram.pad 44 n := ngram.n 45 input := pad + str + pad 46 prevIndexes := make([]int, maxN) 47 var counter int 48 results := make([]uint32, 0) 49 50 for index := range input { 51 counter++ 52 if counter > n { 53 top := prevIndexes[(counter-n)%maxN] 54 substr := input[top:index] 55 hash := murmur3.Sum32([]byte(substr)) 56 results = append(results, hash) 57 } 58 prevIndexes[counter%maxN] = index 59 } 60 61 for i := n - 1; i > 1; i-- { 62 if len(input) >= i { 63 top := prevIndexes[(len(input)-i)%maxN] 64 substr := input[top:] 65 hash := murmur3.Sum32([]byte(substr)) 66 results = append(results, hash) 67 } 68 } 69 70 return results, nil 71 } 72 73 func (ngram *NGramIndex) init() { 74 ngram.Lock() 75 defer ngram.Unlock() 76 77 ngram.index = make(map[uint32]nGramValue) 78 if ngram.pad == "" { 79 ngram.pad = defaultPad 80 } 81 if ngram.n == 0 { 82 ngram.n = defaultN 83 } 84 if ngram.warp == 0.0 { 85 ngram.warp = 1.0 86 } 87 } 88 89 type Option func(*NGramIndex) error 90 91 // SetPad must be used to pass padding character to NGramIndex c-tor 92 func SetPad(c rune) Option { 93 return func(ngram *NGramIndex) error { 94 ngram.pad = string(c) 95 return nil 96 } 97 } 98 99 // SetN must be used to pass N (gram size) to NGramIndex c-tor 100 func SetN(n int) Option { 101 return func(ngram *NGramIndex) error { 102 if n < 2 || n > maxN { 103 return errors.New("bad 'n' value for n-gram index") 104 } 105 ngram.n = n 106 return nil 107 } 108 } 109 110 // SetWarp must be used to pass warp to NGramIndex c-tor 111 func SetWarp(warp float64) Option { 112 return func(ngram *NGramIndex) error { 113 if warp < 0.0 || warp > 1.0 { 114 return errors.New("bad 'warp' value for n-gram index") 115 } 116 ngram.warp = warp 117 return nil 118 } 119 } 120 121 // NewNGramIndex is N-gram index c-tor. In most cases must be used withot parameters. 122 // You can pass parameters to c-tor using functions SetPad, SetWarp and SetN. 123 func NewNGramIndex(opts ...Option) (*NGramIndex, error) { 124 ngram := new(NGramIndex) 125 for _, opt := range opts { 126 if err := opt(ngram); err != nil { 127 return nil, err 128 } 129 } 130 ngram.init() 131 return ngram, nil 132 } 133 134 // Add token to index. Function returns token id, this id can be converted 135 // to string with function "GetString". 136 func (ngram *NGramIndex) Add(input string) (TokenID, error) { 137 if ngram.index == nil { 138 ngram.init() 139 } 140 results, error := ngram.splitInput(input) 141 if error != nil { 142 return -1, error 143 } 144 ixstr, error := ngram.spool.Append(input) 145 if error != nil { 146 return -1, error 147 } 148 for _, hash := range results { 149 ngram.Lock() 150 if ngram.index[hash] == nil { 151 ngram.index[hash] = make(map[TokenID]int) 152 } 153 // insert string and counter 154 ngram.index[hash][ixstr]++ 155 ngram.Unlock() 156 } 157 return ixstr, nil 158 } 159 160 // GetString converts token-id to string. 161 func (ngram *NGramIndex) GetString(id TokenID) (string, error) { 162 return ngram.spool.ReadAt(id) 163 } 164 165 // countNgrams maps matched tokens to the number of ngrams, shared with input string 166 func (ngram *NGramIndex) countNgrams(inputNgrams []uint32) map[TokenID]int { 167 counters := make(map[TokenID]int) 168 for _, ngramHash := range inputNgrams { 169 ngram.RLock() 170 for tok := range ngram.index[ngramHash] { 171 counters[tok]++ 172 } 173 ngram.RUnlock() 174 } 175 return counters 176 } 177 178 func validateThresholdValues(thresholds []float64) (float64, error) { 179 var tval float64 180 if len(thresholds) == 1 { 181 tval = thresholds[0] 182 if tval < 0.0 || tval > 1.0 { 183 return 0.0, errors.New("threshold must be in range (0, 1)") 184 } 185 } else if len(thresholds) > 1 { 186 return 0.0, errors.New("too many arguments") 187 } 188 return tval, nil 189 } 190 191 func (ngram *NGramIndex) match(input string, tval float64) ([]SearchResult, error) { 192 inputNgrams, error := ngram.splitInput(input) 193 if error != nil { 194 return nil, error 195 } 196 output := make([]SearchResult, 0) 197 tokenCount := ngram.countNgrams(inputNgrams) 198 for token, count := range tokenCount { 199 var sim float64 200 allngrams := float64(len(inputNgrams)) 201 matchngrams := float64(count) 202 if ngram.warp == 1.0 { 203 sim = matchngrams / allngrams 204 } else { 205 diffngrams := allngrams - matchngrams 206 sim = math.Pow(allngrams, ngram.warp) - math.Pow(diffngrams, ngram.warp) 207 sim /= math.Pow(allngrams, ngram.warp) 208 } 209 if sim >= tval { 210 res := SearchResult{Similarity: sim, TokenID: token} 211 output = append(output, res) 212 } 213 } 214 return output, nil 215 } 216 217 // Search for matches between query string (input) and indexed strings. 218 // First parameter - threshold is optional and can be used to set minimal similarity 219 // between input string and matching string. You can pass only one threshold value. 220 // Results is an unordered array of 'SearchResult' structs. This struct contains similarity 221 // value (float32 value from threshold to 1.0) and token-id. 222 func (ngram *NGramIndex) Search(input string, threshold ...float64) ([]SearchResult, error) { 223 if ngram.index == nil { 224 ngram.init() 225 } 226 tval, error := validateThresholdValues(threshold) 227 if error != nil { 228 return nil, error 229 } 230 return ngram.match(input, tval) 231 } 232 233 // BestMatch is the same as Search except that it's returning only one best result instead of all. 234 func (ngram *NGramIndex) BestMatch(input string, threshold ...float64) (*SearchResult, error) { 235 if ngram.index == nil { 236 ngram.init() 237 } 238 tval, error := validateThresholdValues(threshold) 239 if error != nil { 240 return nil, error 241 } 242 variants, error := ngram.match(input, tval) 243 if error != nil { 244 return nil, error 245 } 246 if len(variants) == 0 { 247 return nil, errors.New("no matches found") 248 } 249 var result SearchResult 250 maxsim := -1.0 251 for _, val := range variants { 252 if val.Similarity > maxsim { 253 maxsim = val.Similarity 254 result = val 255 } 256 } 257 return &result, nil 258 }