github.com/sajari/fuzzy@v1.0.0/fuzzy_test.go (about) 1 package fuzzy 2 3 import ( 4 "fmt" 5 "runtime" 6 "strings" 7 "sync" 8 "testing" 9 "time" 10 ) 11 12 var sampleEnglish []string 13 14 func init() { 15 sampleEnglish = SampleEnglish() 16 } 17 18 func TestSpelling(t *testing.T) { 19 model := NewModel() 20 21 // For testing only, this is not advisable on production 22 model.SetThreshold(1) 23 24 // Train multiple words simultaneously 25 words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're", "bob", "your"} 26 model.Train(words) 27 28 // Check Spelling 29 if model.SpellCheck("yor") != "your" { 30 t.Errorf("Spell check: Single char delete failed") 31 } 32 if model.SpellCheck("uncel") != "uncle" { 33 t.Errorf("Spell check: Single char transpose failed") 34 } 35 if model.SpellCheck("dynemite") != "dynamite" { 36 t.Errorf("Spell check: Single char swap failed") 37 } 38 if model.SpellCheck("dellicate") != "delicate" { 39 t.Errorf("Spell check: Single char insertion failed") 40 } 41 if model.SpellCheck("dellicade") != "delicate" { 42 t.Errorf("Spell check: Two char change failed") 43 } 44 } 45 46 func TestSpellingSuggestions(t *testing.T) { 47 model := NewModel() 48 49 // For testing only, this is not advisable on production 50 model.SetThreshold(1) 51 52 // Train multiple words simultaneously 53 words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're", "bob", "your"} 54 model.Train(words) 55 56 // Check Spelling 57 if model.SpellCheckSuggestions("yor", 2)[0] != "your" { 58 t.Errorf("Spell check: Single char delete failed") 59 } 60 if model.SpellCheckSuggestions("uncel", 2)[0] != "uncle" { 61 t.Errorf("Spell check: Single char transpose failed") 62 } 63 if model.SpellCheckSuggestions("dynemite", 2)[0] != "dynamite" { 64 t.Errorf("Spell check: Single char swap failed") 65 } 66 if model.SpellCheckSuggestions("dellicate", 2)[0] != "delicate" { 67 t.Errorf("Spell check: Single char insertion failed") 68 } 69 if model.SpellCheckSuggestions("dellicade", 2)[0] != "delicate" { 70 t.Errorf("Spell check: Two char change failed") 71 } 72 73 suggestions := model.SpellCheckSuggestions("bigge", 2) 74 if suggestions[0] != "bigger" { 75 t.Errorf("Spell check suggestions, Single char delete is closest") 76 } 77 if suggestions[1] != "biggest" { 78 t.Errorf("Spell check suggestions, Double char delete 2nd closest") 79 } 80 } 81 82 func TestSuggestions(t *testing.T) { 83 model := NewModel() 84 85 // For testing only, this is not advisable on production 86 model.SetThreshold(1) 87 88 // Train multiple words simultaneously 89 words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're"} 90 model.Train(words) 91 92 // Train word by word 93 model.TrainWord("single") 94 95 // Suggest completions 96 potential := model.Suggestions("bigge", false) 97 bigger := false 98 biggest := false 99 for _, term := range potential { 100 if term == "bigger" { 101 bigger = true 102 } 103 if term == "biggest" { 104 biggest = true 105 } 106 } 107 if !biggest || !bigger { 108 t.Errorf("Suggestions are missing values that should be there") 109 } 110 } 111 112 func TestManualTermAddition(t *testing.T) { 113 model := NewModel() 114 model.SetThreshold(4) 115 116 model.SetCount("elephant", 10, true) 117 118 if model.SpellCheck("elphant") != "elephant" { 119 t.Errorf("Spell check: manual term addition didn't work") 120 } 121 } 122 123 // Not exhaustive, but shows training and spell checks can run concurrently 124 func TestConcurrency(t *testing.T) { 125 cpu := runtime.NumCPU() 126 runtime.GOMAXPROCS(cpu) 127 model := NewModel() 128 129 piece := len(sampleEnglish) / cpu 130 131 var wg sync.WaitGroup 132 // Train concurrently 133 for i := 0; i < cpu; i++ { 134 wg.Add(1) 135 go func(i int) { 136 begin := i * piece 137 end := (i+1)*piece - 1 138 model.Train(sampleEnglish[begin:end]) 139 wg.Done() 140 }(i) 141 } 142 wg.Wait() 143 144 // Test concurrently 145 words := []string{"bob", "your", "uncle", "dynmite", "delidate", "bgigest", "bigr", "biger", "arnty", "you're"} 146 for i := 0; i < cpu; i++ { 147 wg.Add(1) 148 go func() { 149 for _, word := range words { 150 model.SpellCheck(word) 151 } 152 wg.Done() 153 }() 154 } 155 wg.Wait() 156 } 157 158 func TestColdInit(t *testing.T) { 159 model := NewModel() 160 _, err := model.Autocomplete("a") 161 if err != nil { 162 t.Errorf("Failed to init and autocomplete: %v", err) 163 } 164 } 165 166 // Accuracy test sets come from Peter Norvig's set 167 // The big.txt file is also from Peter Norvig's set. This helps to define a decent 168 // dictionary, although it is still missing some of the common words in the test sets 169 // We aim for > 60% correction success at a rate of > 5000Hz (single threaded) 170 func TestAccuracy(t *testing.T) { 171 const test2AccuracyThreshold = .60 172 173 tests1 := map[string]string{"access": "acess", "accessing": "accesing", "accommodation": "accomodation acommodation acomodation", "account": "acount", "address": "adress adres", "addressable": "addresable", "arranged": "aranged arrainged", 174 "arranging": "aranging", "arrangement": "arragment", "articles": "articals", 175 "aunt": "annt anut arnt", "auxiliary": "auxillary", "available": "avaible", 176 "awful": "awfall afful", "basically": "basicaly", "beginning": "begining", 177 "benefit": "benifit", "benefits": "benifits", "between": "beetween", "bicycle": "bicycal bycicle bycycle", "biscuits": "biscits biscutes biscuts bisquits buiscits buiscuts", "built": "biult", 178 "cake": "cak", "career": "carrer", 179 "cemetery": "cemetary semetary", "centrally": "centraly", "certain": "cirtain", 180 "challenges": "chalenges chalenges", "chapter": "chaper chaphter chaptur", 181 "choice": "choise", "choosing": "chosing", "clerical": "clearical", 182 "committee": "comittee", "compare": "compair", "completely": "completly", 183 "consider": "concider", "considerable": "conciderable", "contented": "contenpted contende contended contentid", "curtains": "cartains certans courtens cuaritains curtans curtians curtions", "decide": "descide", "decided": "descided", "definitely": "definately difinately", "definition": "defenition", 184 "definitions": "defenitions", "description": "discription", "desiccate": "desicate dessicate dessiccate", "diagrammatically": "diagrammaticaally", 185 "different": "diffrent", "driven": "dirven", "ecstasy": "exstacy ecstacy", 186 "embarrass": "embaras embarass", "establishing": "astablishing establising", 187 "experience": "experance experiance", "experiences": "experances", "extended": "extented", "extremely": "extreamly", "fails": "failes", "families": "familes", 188 "february": "febuary", "further": "futher", "gallery": "galery gallary gallerry gallrey", 189 "hierarchal": "hierachial", "hierarchy": "hierchy", "inconvenient": "inconvienient inconvient inconvinient", "independent": "independant independant", 190 "initial": "intial", "initials": "inetials inistals initails initals intials", 191 "juice": "guic juce jucie juise juse", "latest": "lates latets latiest latist", 192 "laugh": "lagh lauf laught lugh", "level": "leval", 193 "levels": "levals", "liaison": "liaision liason", "lieu": "liew", "literature": "litriture", "loans": "lones", "locally": "localy", "magnificent": "magnificnet magificent magnifcent magnifecent magnifiscant magnifisent magnificant", 194 "management": "managment", "meant": "ment", "minuscule": "miniscule", 195 "minutes": "muinets", "monitoring": "monitering", "necessary": "neccesary necesary neccesary necassary necassery neccasary", "occurrence": "occurence occurence", "often": "ofen offen offten ofton", "opposite": "opisite oppasite oppesite oppisit oppisite opposit oppossite oppossitte", "parallel": "paralel paralell parrallel parralell parrallell", "particular": "particulaur", 196 "perhaps": "perhapse", "personnel": "personnell", "planned": "planed", "poem": "poame", "poems": "poims pomes", "poetry": "poartry poertry poetre poety powetry", 197 "position": "possition", "possible": "possable", "pretend": "pertend protend prtend pritend", "problem": "problam proble promblem proplen", 198 "pronunciation": "pronounciation", "purple": "perple perpul poarple", 199 "questionnaire": "questionaire", "really": "realy relley relly", "receipt": "receit receite reciet recipt", "receive": "recieve", "refreshment": "reafreshment refreshmant refresment refressmunt", "remember": "rember remeber rememmer rermember", 200 "remind": "remine remined", "scarcely": "scarcly scarecly scarely scarsely", 201 "scissors": "scisors sissors", "separate": "seperate", 202 "singular": "singulaur", "someone": "somone", "sources": "sorces", "southern": "southen", "special": "speaical specail specal speical", "splendid": "spledid splended splened splended", "standardizing": "stanerdizing", "stomach": "stomac stomache stomec stumache", "supersede": "supercede superceed", "there": "ther", 203 "totally": "totaly", "transferred": "transfred", "transportability": "transportibility", "triangular": "triangulaur", "understand": "undersand undistand", 204 "unexpected": "unexpcted unexpeted unexspected", "unfortunately": "unfortunatly", "unique": "uneque", "useful": "usefull", "valuable": "valubale valuble", 205 "variable": "varable", "variant": "vairiant", "various": "vairious", 206 "visited": "fisited viseted vistid vistied", "visitors": "vistors", 207 "voluntary": "volantry", "voting": "voteing", "wanted": "wantid wonted", 208 "whether": "wether", "wrote": "rote wote"} 209 210 tests2 := map[string]string{"forbidden": "forbiden", "decisions": "deciscions descisions", 211 "supposedly": "supposidly", "embellishing": "embelishing", "technique": "tecnique", "permanently": "perminantly", "confirmation": "confermation", 212 "appointment": "appoitment", "progression": "progresion", "accompanying": "acompaning", "applicable": "aplicable", "regained": "regined", "guidelines": "guidlines", "surrounding": "serounding", "titles": "tittles", "unavailable": "unavailble", "advantageous": "advantageos", "brief": "brif", "appeal": "apeal", "consisting": "consisiting", "clerk": "cleark clerck", "component": "componant", "favourable": "faverable", "separation": "seperation", "search": "serch", "receive": "recieve", "employees": "emploies", "prior": "piror", 213 "resulting": "reulting", "suggestion": "sugestion", "opinion": "oppinion", 214 "cancellation": "cancelation", "criticism": "citisum", "useful": "usful", 215 "humour": "humor", "anomalies": "anomolies", "would": "whould", "doubt": "doupt", "examination": "eximination", "therefore": "therefoe", "recommend": "recomend", "separated": "seperated", "successful": "sucssuful succesful", 216 "apparent": "apparant", "occurred": "occureed", "particular": "paerticulaur", 217 "pivoting": "pivting", "announcing": "anouncing", "challenge": "chalange", 218 "arrangements": "araingements", "proportions": "proprtions", "organized": "oranised", "accept": "acept", "dependence": "dependance", "unequalled": "unequaled", "numbers": "numbuers", "sense": "sence", "conversely": "conversly", "provide": "provid", "arrangement": "arrangment", 219 "responsibilities": "responsiblities", "fourth": "forth", "ordinary": "ordenary", "description": "desription descvription desacription", 220 "inconceivable": "inconcievable", "data": "dsata", "register": "rgister", 221 "supervision": "supervison", "encompassing": "encompasing", "negligible": "negligable", "allow": "alow", "operations": "operatins", "executed": "executted", "interpretation": "interpritation", "hierarchy": "heiarky", 222 "indeed": "indead", "years": "yesars", "through": "throut", "committee": "committe", "inquiries": "equiries", "before": "befor", "continued": "contuned", "permanent": "perminant", "choose": "chose", "virtually": "vertually", "correspondence": "correspondance", "eventually": "eventully", 223 "lonely": "lonley", "profession": "preffeson", "they": "thay", "now": "noe", 224 "desperately": "despratly", "university": "unversity", "adjournment": "adjurnment", "possibilities": "possablities", "stopped": "stoped", "mean": "meen", "weighted": "wagted", "adequately": "adequattly", "shown": "hown", 225 "matrix": "matriiix", "profit": "proffit", "encourage": "encorage", "collate": "colate", "disaggregate": "disaggreagte disaggreaget", "receiving": "recieving reciving", "proviso": "provisoe", "umbrella": "umberalla", "approached": "aproached", "pleasant": "plesent", "difficulty": "dificulty", "appointments": "apointments", "base": "basse", "conditioning": "conditining", "earliest": "earlyest", "beginning": "begining", "universally": "universaly", 226 "unresolved": "unresloved", "length": "lengh", "exponentially": "exponentualy", "utilized": "utalised", "set": "et", "surveys": "servays", 227 "families": "familys", "system": "sysem", "approximately": "aproximatly", 228 "their": "ther", "scheme": "scheem", "speaking": "speeking", "repetitive": "repetative", "inefficient": "ineffiect", "geneva": "geniva", "exactly": "exsactly", "immediate": "imediate", "appreciation": "apreciation", "luckily": "luckeley", "eliminated": "elimiated", "believe": "belive", "appreciated": "apreciated", "readjusted": "reajusted", "were": "wer where", "feeling": "fealing", "and": "anf", "false": "faulse", "seen": "seeen", "interrogating": "interogationg", "academically": "academicly", "relatively": "relativly relitivly", 229 "traditionally": "traditionaly", "studying": "studing", 230 "majority": "majorty", "build": "biuld", "aggravating": "agravating", 231 "transactions": "trasactions", "arguing": "aurguing", "sheets": "sheertes", 232 "successive": "sucsesive sucessive", "segment": "segemnt", "especially": "especaily", "later": "latter", "senior": "sienior", "dragged": "draged", 233 "atmosphere": "atmospher", "drastically": "drasticaly", "particularly": "particulary", "visitor": "vistor", "session": "sesion", "continually": "contually", "availability": "avaiblity", "busy": "buisy", "parameters": "perametres", "surroundings": "suroundings seroundings", "employed": "emploied", "adequate": "adiquate", "handle": "handel", "means": "meens", 234 "familiar": "familer", "between": "beeteen", "overall": "overal", "timing": "timeing", "committees": "comittees commitees", "queries": "quies", 235 "econometric": "economtric", "erroneous": "errounous", "decides": "descides", 236 "reference": "refereence refference", "intelligence": "inteligence", 237 "edition": "ediion ediition", "are": "arte", "apologies": "appologies", 238 "thermawear": "thermawere thermawhere", "techniques": "tecniques", 239 "voluntary": "volantary", "subsequent": "subsequant subsiquent", "currently": "curruntly", "forecast": "forcast", "weapons": "wepons", "routine": "rouint", 240 "neither": "niether", "approach": "aproach", "available": "availble", 241 "recently": "reciently", "ability": "ablity", "nature": "natior", 242 "commercial": "comersial", "agencies": "agences", "however": "howeverr", 243 "suggested": "sugested", "career": "carear", "many": "mony", "annual": "anual", "according": "acording", "receives": "recives recieves", 244 "interesting": "intresting", "expense": "expence", "relevant": "relavent relevaant", "table": "tasble", "throughout": "throuout", "conference": "conferance", "sensible": "sensable", "described": "discribed describd", 245 "union": "unioun", "interest": "intrest", "flexible": "flexable", "refered": "reffered", "controlled": "controled", "sufficient": "suficient", 246 "dissension": "desention", "adaptable": "adabtable", "representative": "representitive", "irrelevant": "irrelavent", "unnecessarily": "unessasarily", 247 "applied": "upplied", "apologised": "appologised", "these": "thees thess", 248 "choices": "choises", "will": "wil", "procedure": "proceduer", "shortened": "shortend", "manually": "manualy", "disappointing": "dissapoiting", 249 "excessively": "exessively", "comments": "coments", "containing": "containg", 250 "develop": "develope", "credit": "creadit", "government": "goverment", 251 "acquaintances": "aquantences", "orientated": "orentated", "widely": "widly", 252 "advise": "advice", "difficult": "dificult", "investigated": "investegated", 253 "bonus": "bonas", "conceived": "concieved", "nationally": "nationaly", 254 "compared": "comppared compased", "moving": "moveing", "necessity": "nessesity", "opportunity": "oppertunity oppotunity opperttunity", "thoughts": "thorts", "equalled": "equaled", "variety": "variatry", "analysis": "analiss analsis analisis", "patterns": "pattarns", "qualities": "quaties", "easily": "easyly", "organization": "oranisation oragnisation", "the": "thw hte thi", 255 "corporate": "corparate", "composed": "compossed", "enormously": "enomosly", 256 "financially": "financialy", "functionally": "functionaly", "discipline": "disiplin", "announcement": "anouncement", "progresses": "progressess", 257 "except": "excxept", "recommending": "recomending", "mathematically": "mathematicaly", "source": "sorce", "combine": "comibine", "input": "inut", 258 "careers": "currers carrers", "resolved": "resoved", "demands": "diemands", 259 "unequivocally": "unequivocaly", "suffering": "suufering", "immediately": "imidatly imediatly", "accepted": "acepted", "projects": "projeccts", 260 "necessary": "necasery nessasary nessisary neccassary", "journalism": "journaism", "unnecessary": "unessessay", "night": "nite", "output": "oputput", "security": "seurity", "essential": "esential", "beneficial": "benificial benficial", "explaining": "explaning", "supplementary": "suplementary", "questionnaire": "questionare", "employment": "empolyment", 261 "proceeding": "proceding", "decision": "descisions descision", "per": "pere", 262 "discretion": "discresion", "reaching": "reching", "analysed": "analised", 263 "expansion": "expanion", "although": "athough", "subtract": "subtrcat", 264 "analysing": "aalysing", "comparison": "comparrison", "months": "monthes", 265 "hierarchal": "hierachial", "misleading": "missleading", "commit": "comit", 266 "auguments": "aurgument", "within": "withing", "obtaining": "optaning", 267 "accounts": "acounts", "primarily": "pimarily", "operator": "opertor", 268 "accumulated": "acumulated", "extremely": "extreemly", "there": "thear", 269 "summarys": "sumarys", "analyse": "analiss", "understandable": "understadable", "safeguard": "safegaurd", "consist": "consisit", 270 "declarations": "declaratrions", "minutes": "muinutes muiuets", "associated": "assosiated", "accessibility": "accessability", "examine": "examin", 271 "surveying": "servaying", "politics": "polatics", "annoying": "anoying", 272 "again": "agiin", "assessing": "accesing", "ideally": "idealy", "scrutinized": "scrutiniesed", "simular": "similar", "personnel": "personel", "whereas": "wheras", "when": "whn", "geographically": "goegraphicaly", "gaining": "ganing", "requested": "rquested", "separate": "seporate", "students": "studens", "prepared": "prepaired", "generated": "generataed", "graphically": "graphicaly", "suited": "suted", "variable": "varible vaiable", "building": "biulding", "required": "reequired", "necessitates": "nessisitates", 273 "together": "togehter", "profits": "proffits"} 274 275 model := NewModel() 276 model.SetThreshold(1) // This ensures a more complete dictionary at the expense of size/speed. 277 model.Train(sampleEnglish) 278 279 // Look at test sets 280 // SET 1 281 count, correct, incorrect := 0, 0, 0 282 t2 := time.Now() 283 for target, testwords := range tests1 { 284 testwordarr := strings.Split(testwords, " ") 285 for _, testword := range testwordarr { 286 if model.SpellCheck(testword) == target { 287 correct++ 288 } else { 289 incorrect++ 290 } 291 count++ 292 } 293 } 294 t3 := time.Now() 295 296 fmt.Printf("Spell test1 count: %v, Correct: %v, Incorrect: %v, Ratio: %f, Total time: %v \n\n", count, correct, incorrect, float32(correct)/float32(count), t3.Sub(t2)) 297 298 successrate := float32(correct) / float32(count) 299 if successrate < 0.60 { 300 t.Errorf("Unacceptable correction rate for set test1 (%v). e.g. below 60 percent.", successrate) 301 } 302 303 // 5000Hz is our aim 304 maxtime := time.Duration(count) * 200 * time.Microsecond 305 306 if t3.Sub(t2) > maxtime { 307 t.Errorf("Unacceptable completion time for set test1 (%v). e.g. %v corrections took greater than %v.", t3.Sub(t2), count, maxtime) 308 } 309 310 // SET 2 311 count, correct, incorrect = 0, 0, 0 312 t2 = time.Now() 313 for target, testwords := range tests2 { 314 testwordarr := strings.Split(testwords, " ") 315 for _, testword := range testwordarr { 316 if model.SpellCheck(testword) == target { 317 correct++ 318 } else { 319 incorrect++ 320 } 321 count++ 322 } 323 } 324 t3 = time.Now() 325 326 fmt.Printf("Spell test2 count: %v, Correct: %v, Incorrect: %v, Ratio: %f, Total time: %v \n\n", count, correct, incorrect, float32(correct)/float32(count), t3.Sub(t2)) 327 328 successrate = float32(correct) / float32(count) 329 if successrate < test2AccuracyThreshold { 330 t.Errorf("Unacceptable correction rate for set test2 (%v). e.g. below %v.", successrate, test2AccuracyThreshold) 331 } 332 333 // 5000Hz is our aim 334 maxtime = time.Duration(count) * 200 * time.Microsecond 335 336 if t3.Sub(t2) > maxtime { 337 t.Errorf("Unacceptable completion time for set test2 (%v). e.g. %v corrections took greater than %v", t3.Sub(t2), count, maxtime) 338 } 339 340 } 341 342 // Quick test to make sure we're picking up the right stuff 343 func TestAutocomplete(t *testing.T) { 344 model := NewModel() 345 model.Train(sampleEnglish) 346 out, err := model.Autocomplete("accoun") 347 if err != nil { 348 t.Errorf("Auocomplete() returned and error: ", err) 349 } 350 expected := map[string]bool{ 351 "account": true, 352 "accountant": true, 353 "accounts": true, 354 "accounted": true, 355 } 356 for _, m := range out { 357 if val, ok := expected[m]; !ok { 358 t.Errorf("Expected to find %v (%v), but didn't", m, val) 359 } 360 } 361 } 362 363 // Test to ensure query training begins to dominate over 364 // corpus training when autocompleting 365 func TestAutocompleteFromQueries(t *testing.T) { 366 model := NewModel() 367 // Changing defaults for testing only, this is not advisable on production 368 model.SetThreshold(1) 369 model.SetDivergenceThreshold(1) 370 371 model.Train([]string{"every", "every", "every", "every", "every", "every", "everest", "eveready", "eveready", "everything", "everything"}) 372 model.TrainQuery("everest") // Simulate a query 373 model.TrainQuery("everest") // Simulate a query 374 model.TrainQuery("eveready") // Simulate a query 375 376 out, err := model.Autocomplete("eve") 377 if err != nil { 378 t.Errorf("Auocomplete() returned and error: ", err) 379 } 380 if out[0] != "everest" { 381 t.Errorf("Autocomplete failed to account for query training") 382 } 383 if out[1] != "eveready" { 384 t.Errorf("Autocomplete failed to account for query training") 385 } 386 } 387 388 func TestLoadOldModel(t *testing.T) { 389 if _, err := Load("data/test.dict"); err != nil { 390 t.Errorf("Couldn't load old model format: %v", err) 391 } 392 }