github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/wordnet/wordnet.go (about) 1 // Package wordnet provides a WordNet parser and interface. 2 // 3 // # Basic usage 4 // 5 // The main entry point is the WordNet type. It holds all the data of a 6 // WordNet dictionary, and provides search methods. 7 // 8 // To search for the noun meanings of 'cat': 9 // 10 // wn, _ := wordnet.Parse(...) 11 // catNouns := wn.Search("cat")["n"] 12 // // = slice of all synsets that contain the word "cat" and are nouns. 13 // 14 // To calculate similarity between words: 15 // 16 // wn, _ := wordnet.Parse(...) 17 // cat := wn.Search("cat")["n"][0] 18 // dog := wn.Search("dog")["n"][0] 19 // similarity := wn.PathSimilarity(cat, dog, false) 20 // // = 0.2 21 // 22 // To get usage examples for verbs: 23 // 24 // wn, _ := wordnet.Parse(...) 25 // eat := wn.Search("eat")["v"][1] 26 // examples := wn.Examples(eat) 27 // // = string slice of examples for the words in the 'eat' synset. 28 // 29 // # Parts of speech 30 // 31 // Some data refers to parts of speech (POS). Everywhere a part of speech is 32 // expected, it is a single letter as follows: 33 // 34 // a: adjective 35 // n: noun 36 // r: adverb 37 // v: verb 38 // 39 // # Citation 40 // 41 // This API is based on: Princeton University "About WordNet." WordNet. 42 // Princeton University. 2010. http://wordnet.princeton.edu 43 // 44 // Please cite them if you use this API. 45 package wordnet 46 47 import ( 48 "fmt" 49 "math" 50 "sort" 51 "strings" 52 ) 53 54 // Parse parses an entire WordNet directory. Path is the root of the directory. 55 // The parser will trverse it and parse the required files, assuming 56 // directory structure is as published. 57 func Parse(path string) (*WordNet, error) { 58 result := &WordNet{} 59 var err error 60 61 result.Example, err = parseExampleFile(path) 62 if err != nil { 63 // Older versions of the database don't have examples, so skipping if 64 // not found. 65 result.Example = map[string]string{} 66 } 67 68 examples, err := parseExampleIndexFile(path) 69 if err != nil { 70 // Older versions of the database don't have examples, so skipping if 71 // not found. 72 examples = map[string][]int{} 73 } 74 75 result.Synset, err = parseDataFiles(path, examples) 76 if err != nil { 77 return nil, err 78 } 79 80 result.Exception, err = parseExceptionFiles(path) 81 if err != nil { 82 // Older versions of the database don't have exceptions, so skipping if 83 // not found. 84 result.Exception = map[string][]string{} 85 } 86 87 result.indexLemma() 88 89 result.LemmaRanked, err = parseIndexFiles(path) 90 if err != nil { 91 return nil, err 92 } 93 94 return result, nil 95 } 96 97 // Search searches for a word in the dictionary. Returns a map from part of 98 // speech (a, n, r, v) to all synsets that contain that word. 99 func (wn *WordNet) Search(word string) map[string][]*Synset { 100 result := map[string][]*Synset{} 101 for _, pos := range [...]string{"a", "n", "r", "v"} { 102 ids := wn.Lemma[pos+word] 103 result[pos] = make([]*Synset, len(ids)) 104 for i, id := range ids { 105 result[pos][i] = wn.Synset[id] 106 } 107 } 108 // TODO(amit): Search in exceptions too? 109 return result 110 } 111 112 // SearchRanked searches for a word in the dictionary. Returns a map from part 113 // of speech (a, n, r, v) to synsets that contain that word, ranked from the 114 // most frequently used to the least. 115 // 116 // Only a subset of the synsets are ranked so this may return less synsets than 117 // what Search would have. 118 func (wn *WordNet) SearchRanked(word string) map[string][]*Synset { 119 result := map[string][]*Synset{} 120 for _, pos := range [...]string{"a", "n", "r", "v"} { 121 ids := wn.LemmaRanked[pos+"."+word] 122 result[pos] = make([]*Synset, len(ids)) 123 for i, id := range ids { 124 result[pos][i] = wn.Synset[id] 125 } 126 } 127 // TODO(amit): Search in exceptions too? 128 return result 129 } 130 131 // PathSimilarity returns a score denoting how similar two word senses are, 132 // based on the shortest path that connects the senses in the is-a 133 // (hypernym/hypnoym) taxonomy. The score is in the range 0 to 1, where 1 means 134 // identity and 0 means completely disjoint. 135 // 136 // If simulateRoot is true, will create a common fake root for the top of each 137 // synset's hierarchy if no common ancestor was found. 138 // 139 // Based on NLTK's path_similarity function. 140 func (wn *WordNet) PathSimilarity(from, to *Synset, simulateRoot bool) float64 { 141 hypFrom := wn.hypernyms(from) 142 hypTo := wn.hypernyms(to) 143 shortest := math.MaxInt32 144 145 // Find common ancestor that gives the shortest path. 146 for s := range hypFrom { 147 if _, ok := hypTo[s]; ok { 148 distance := hypFrom[s] + hypTo[s] 149 if distance < shortest { 150 shortest = distance 151 } 152 } 153 } 154 155 // If no common ancestor, make a fake root. 156 if shortest == math.MaxInt32 { 157 if simulateRoot { 158 depthFrom := maxSynsetDistance(hypFrom) 159 depthTo := maxSynsetDistance(hypTo) 160 shortest = depthFrom + depthTo + 2 // 2 for fake root. 161 } else { 162 return 0 163 } 164 } 165 166 return 1.0 / float64(shortest+1) 167 } 168 169 // WupSimilarity is Wu-Palmer Similarity. Returns a score denoting how similar 170 // two word senses are, based on the depth of the two senses in the taxonomy 171 // and that of their Least Common Subsumer (most specific ancestor node). 172 // 173 // If simulateRoot is true, will create a common fake root for the top of each 174 // synset's hierarchy if no common ancestor was found. 175 // 176 // Based on NLTK's wup_similarity function. 177 func (wn *WordNet) WupSimilarity(from, to *Synset, simulateRoot bool) float64 { 178 hypFrom := wn.hypernyms(from) 179 hypTo := wn.hypernyms(to) 180 var ancestor *Synset 181 182 // Find deepest common ancestor. 183 for s := range hypFrom { 184 if _, ok := hypTo[s]; ok { 185 if ancestor == nil || hypFrom[s] < hypFrom[ancestor] { 186 ancestor = s 187 } 188 } 189 } 190 191 var depthFrom, depthTo, depthAncestor int 192 193 if ancestor != nil { 194 depthAncestor = maxSynsetDistance(wn.hypernyms(ancestor)) + 1 195 depthFrom = depthAncestor + hypFrom[ancestor] 196 depthTo = depthAncestor + hypTo[ancestor] 197 } else { 198 // If no common ancestor, make a fake root. 199 if simulateRoot { 200 depthFrom = maxSynsetDistance(hypFrom) + 1 201 depthTo = maxSynsetDistance(hypTo) + 1 202 depthAncestor = 1 203 } else { 204 return 0 205 } 206 } 207 208 return 2.0 * float64(depthAncestor) / float64(depthFrom+depthTo) 209 } 210 211 // Returns the hypernym hierarchy of the synset, with their distance from the 212 // input synset. 213 func (wn *WordNet) hypernyms(ss *Synset) map[*Synset]int { 214 result := map[*Synset]int{} 215 next := map[*Synset]struct{}{ss: {}} 216 level := 0 217 for len(next) > 0 { 218 newNext := map[*Synset]struct{}{} 219 for s := range next { 220 result[s] = level 221 for _, ptr := range s.Pointer { 222 if ptr.Symbol[:1] == Hypernym { 223 if _, ok := result[wn.Synset[ptr.Synset]]; !ok { 224 newNext[wn.Synset[ptr.Synset]] = struct{}{} 225 } 226 } 227 } 228 } 229 level++ 230 next = newNext 231 } 232 233 return result 234 } 235 236 // Returns the maximal value from the given map. 237 func maxSynsetDistance(m map[*Synset]int) int { 238 result := 0 239 for _, d := range m { 240 if d > result { 241 result = d 242 } 243 } 244 return result 245 } 246 247 // Indexes all words in the data. 248 func (wn *WordNet) indexLemma() { 249 wn.Lemma = map[string][]string{} 250 251 // Sort synsets to keep index stable. 252 ids := make([]string, 0, len(wn.Synset)) 253 for id := range wn.Synset { 254 ids = append(ids, id) 255 } 256 sort.Strings(ids) 257 258 for _, id := range ids { 259 ss := wn.Synset[id] 260 pos := id[0:1] 261 for _, word := range ss.Word { 262 w := pos + strings.ToLower(word) 263 wn.Lemma[w] = append(wn.Lemma[w], id) 264 } 265 } 266 } 267 268 // Examples returns usage examples for the given synset. Always empty for 269 // non-verbs. 270 func (wn *WordNet) Examples(ss *Synset) []string { 271 result := make([]string, len(ss.Example)) 272 for i := range result { 273 template := wn.Example[fmt.Sprint(ss.Example[i].TemplateNumber)] 274 word := ss.Word[ss.Example[i].WordNumber] 275 result[i] = fmt.Sprintf(template, word) 276 } 277 return result 278 } 279 280 // Id returns the synset's ID, for example n123456. Equals the concatenation of 281 // POS and offset. 282 func (ss *Synset) Id() string { 283 return fmt.Sprintf("%v%v", ss.Pos, ss.Offset) 284 }