github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/wordnet/wordnet.go

github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/wordnet/wordnet.go (about)

     1  // Package wordnet provides a WordNet parser and interface.
     2  //
     3  // # Basic usage
     4  //
     5  // The main entry point is the WordNet type. It holds all the data of a
     6  // WordNet dictionary, and provides search methods.
     7  //
     8  // To search for the noun meanings of 'cat':
     9  //
    10  //	wn, _ := wordnet.Parse(...)
    11  //	catNouns := wn.Search("cat")["n"]
    12  //	// = slice of all synsets that contain the word "cat" and are nouns.
    13  //
    14  // To calculate similarity between words:
    15  //
    16  //	wn, _ := wordnet.Parse(...)
    17  //	cat := wn.Search("cat")["n"][0]
    18  //	dog := wn.Search("dog")["n"][0]
    19  //	similarity := wn.PathSimilarity(cat, dog, false)
    20  //	// = 0.2
    21  //
    22  // To get usage examples for verbs:
    23  //
    24  //	wn, _ := wordnet.Parse(...)
    25  //	eat := wn.Search("eat")["v"][1]
    26  //	examples := wn.Examples(eat)
    27  //	// = string slice of examples for the words in the 'eat' synset.
    28  //
    29  // # Parts of speech
    30  //
    31  // Some data refers to parts of speech (POS). Everywhere a part of speech is
    32  // expected, it is a single letter as follows:
    33  //
    34  //	a: adjective
    35  //	n: noun
    36  //	r: adverb
    37  //	v: verb
    38  //
    39  // # Citation
    40  //
    41  // This API is based on: Princeton University "About WordNet." WordNet.
    42  // Princeton University. 2010. http://wordnet.princeton.edu
    43  //
    44  // Please cite them if you use this API.
    45  package wordnet
    46  
    47  import (
    48  	"fmt"
    49  	"math"
    50  	"sort"
    51  	"strings"
    52  )
    53  
    54  // Parse parses an entire WordNet directory. Path is the root of the directory.
    55  // The parser will trverse it and parse the required files, assuming
    56  // directory structure is as published.
    57  func Parse(path string) (*WordNet, error) {
    58  	result := &WordNet{}
    59  	var err error
    60  
    61  	result.Example, err = parseExampleFile(path)
    62  	if err != nil {
    63  		// Older versions of the database don't have examples, so skipping if
    64  		// not found.
    65  		result.Example = map[string]string{}
    66  	}
    67  
    68  	examples, err := parseExampleIndexFile(path)
    69  	if err != nil {
    70  		// Older versions of the database don't have examples, so skipping if
    71  		// not found.
    72  		examples = map[string][]int{}
    73  	}
    74  
    75  	result.Synset, err = parseDataFiles(path, examples)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  
    80  	result.Exception, err = parseExceptionFiles(path)
    81  	if err != nil {
    82  		// Older versions of the database don't have exceptions, so skipping if
    83  		// not found.
    84  		result.Exception = map[string][]string{}
    85  	}
    86  
    87  	result.indexLemma()
    88  
    89  	result.LemmaRanked, err = parseIndexFiles(path)
    90  	if err != nil {
    91  		return nil, err
    92  	}
    93  
    94  	return result, nil
    95  }
    96  
    97  // Search searches for a word in the dictionary. Returns a map from part of
    98  // speech (a, n, r, v) to all synsets that contain that word.
    99  func (wn *WordNet) Search(word string) map[string][]*Synset {
   100  	result := map[string][]*Synset{}
   101  	for _, pos := range [...]string{"a", "n", "r", "v"} {
   102  		ids := wn.Lemma[pos+word]
   103  		result[pos] = make([]*Synset, len(ids))
   104  		for i, id := range ids {
   105  			result[pos][i] = wn.Synset[id]
   106  		}
   107  	}
   108  	// TODO(amit): Search in exceptions too?
   109  	return result
   110  }
   111  
   112  // SearchRanked searches for a word in the dictionary. Returns a map from part
   113  // of speech (a, n, r, v) to synsets that contain that word, ranked from the
   114  // most frequently used to the least.
   115  //
   116  // Only a subset of the synsets are ranked so this may return less synsets than
   117  // what Search would have.
   118  func (wn *WordNet) SearchRanked(word string) map[string][]*Synset {
   119  	result := map[string][]*Synset{}
   120  	for _, pos := range [...]string{"a", "n", "r", "v"} {
   121  		ids := wn.LemmaRanked[pos+"."+word]
   122  		result[pos] = make([]*Synset, len(ids))
   123  		for i, id := range ids {
   124  			result[pos][i] = wn.Synset[id]
   125  		}
   126  	}
   127  	// TODO(amit): Search in exceptions too?
   128  	return result
   129  }
   130  
   131  // PathSimilarity returns a score denoting how similar two word senses are,
   132  // based on the shortest path that connects the senses in the is-a
   133  // (hypernym/hypnoym) taxonomy. The score is in the range 0 to 1, where 1 means
   134  // identity and 0 means completely disjoint.
   135  //
   136  // If simulateRoot is true, will create a common fake root for the top of each
   137  // synset's hierarchy if no common ancestor was found.
   138  //
   139  // Based on NLTK's path_similarity function.
   140  func (wn *WordNet) PathSimilarity(from, to *Synset, simulateRoot bool) float64 {
   141  	hypFrom := wn.hypernyms(from)
   142  	hypTo := wn.hypernyms(to)
   143  	shortest := math.MaxInt32
   144  
   145  	// Find common ancestor that gives the shortest path.
   146  	for s := range hypFrom {
   147  		if _, ok := hypTo[s]; ok {
   148  			distance := hypFrom[s] + hypTo[s]
   149  			if distance < shortest {
   150  				shortest = distance
   151  			}
   152  		}
   153  	}
   154  
   155  	// If no common ancestor, make a fake root.
   156  	if shortest == math.MaxInt32 {
   157  		if simulateRoot {
   158  			depthFrom := maxSynsetDistance(hypFrom)
   159  			depthTo := maxSynsetDistance(hypTo)
   160  			shortest = depthFrom + depthTo + 2 // 2 for fake root.
   161  		} else {
   162  			return 0
   163  		}
   164  	}
   165  
   166  	return 1.0 / float64(shortest+1)
   167  }
   168  
   169  // WupSimilarity is Wu-Palmer Similarity. Returns a score denoting how similar
   170  // two word senses are, based on the depth of the two senses in the taxonomy
   171  // and that of their Least Common Subsumer (most specific ancestor node).
   172  //
   173  // If simulateRoot is true, will create a common fake root for the top of each
   174  // synset's hierarchy if no common ancestor was found.
   175  //
   176  // Based on NLTK's wup_similarity function.
   177  func (wn *WordNet) WupSimilarity(from, to *Synset, simulateRoot bool) float64 {
   178  	hypFrom := wn.hypernyms(from)
   179  	hypTo := wn.hypernyms(to)
   180  	var ancestor *Synset
   181  
   182  	// Find deepest common ancestor.
   183  	for s := range hypFrom {
   184  		if _, ok := hypTo[s]; ok {
   185  			if ancestor == nil || hypFrom[s] < hypFrom[ancestor] {
   186  				ancestor = s
   187  			}
   188  		}
   189  	}
   190  
   191  	var depthFrom, depthTo, depthAncestor int
   192  
   193  	if ancestor != nil {
   194  		depthAncestor = maxSynsetDistance(wn.hypernyms(ancestor)) + 1
   195  		depthFrom = depthAncestor + hypFrom[ancestor]
   196  		depthTo = depthAncestor + hypTo[ancestor]
   197  	} else {
   198  		// If no common ancestor, make a fake root.
   199  		if simulateRoot {
   200  			depthFrom = maxSynsetDistance(hypFrom) + 1
   201  			depthTo = maxSynsetDistance(hypTo) + 1
   202  			depthAncestor = 1
   203  		} else {
   204  			return 0
   205  		}
   206  	}
   207  
   208  	return 2.0 * float64(depthAncestor) / float64(depthFrom+depthTo)
   209  }
   210  
   211  // Returns the hypernym hierarchy of the synset, with their distance from the
   212  // input synset.
   213  func (wn *WordNet) hypernyms(ss *Synset) map[*Synset]int {
   214  	result := map[*Synset]int{}
   215  	next := map[*Synset]struct{}{ss: {}}
   216  	level := 0
   217  	for len(next) > 0 {
   218  		newNext := map[*Synset]struct{}{}
   219  		for s := range next {
   220  			result[s] = level
   221  			for _, ptr := range s.Pointer {
   222  				if ptr.Symbol[:1] == Hypernym {
   223  					if _, ok := result[wn.Synset[ptr.Synset]]; !ok {
   224  						newNext[wn.Synset[ptr.Synset]] = struct{}{}
   225  					}
   226  				}
   227  			}
   228  		}
   229  		level++
   230  		next = newNext
   231  	}
   232  
   233  	return result
   234  }
   235  
   236  // Returns the maximal value from the given map.
   237  func maxSynsetDistance(m map[*Synset]int) int {
   238  	result := 0
   239  	for _, d := range m {
   240  		if d > result {
   241  			result = d
   242  		}
   243  	}
   244  	return result
   245  }
   246  
   247  // Indexes all words in the data.
   248  func (wn *WordNet) indexLemma() {
   249  	wn.Lemma = map[string][]string{}
   250  
   251  	// Sort synsets to keep index stable.
   252  	ids := make([]string, 0, len(wn.Synset))
   253  	for id := range wn.Synset {
   254  		ids = append(ids, id)
   255  	}
   256  	sort.Strings(ids)
   257  
   258  	for _, id := range ids {
   259  		ss := wn.Synset[id]
   260  		pos := id[0:1]
   261  		for _, word := range ss.Word {
   262  			w := pos + strings.ToLower(word)
   263  			wn.Lemma[w] = append(wn.Lemma[w], id)
   264  		}
   265  	}
   266  }
   267  
   268  // Examples returns usage examples for the given synset. Always empty for
   269  // non-verbs.
   270  func (wn *WordNet) Examples(ss *Synset) []string {
   271  	result := make([]string, len(ss.Example))
   272  	for i := range result {
   273  		template := wn.Example[fmt.Sprint(ss.Example[i].TemplateNumber)]
   274  		word := ss.Word[ss.Example[i].WordNumber]
   275  		result[i] = fmt.Sprintf(template, word)
   276  	}
   277  	return result
   278  }
   279  
   280  // Id returns the synset's ID, for example n123456. Equals the concatenation of
   281  // POS and offset.
   282  func (ss *Synset) Id() string {
   283  	return fmt.Sprintf("%v%v", ss.Pos, ss.Offset)
   284  }