github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/wordnet/parser.go (about)

     1  package wordnet
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"path/filepath"
     9  	"strconv"
    10  	"strings"
    11  )
    12  
    13  // TODO(amit): Convenience functions for pointers?
    14  
    15  // ----- FILE LISTS -----------------------------------------------------------
    16  
// Names of the WordNet database files read by this parser. Each name is
// joined with the dictionary directory that is passed to the parse functions.
var (
	// dataFiles maps each synset data file to the part-of-speech letter that
	// is prepended to the synset offsets read from it.
	dataFiles = map[string]string{
		"data.adj":  "a",
		"data.adv":  "r",
		"data.noun": "n",
		"data.verb": "v",
	}
	// exceptionFiles maps each exception file to the part-of-speech letter
	// that is prepended, followed by a dot, to every word read from it.
	exceptionFiles = map[string]string{
		"adj.exc":  "a",
		"adv.exc":  "r",
		"noun.exc": "n",
		"verb.exc": "v",
	}
	// indexFiles lists the lemma index files, in the order they are parsed.
	indexFiles = []string{
		"index.adj",
		"index.adv",
		"index.noun",
		"index.verb",
	}
	// exampleFile is the verb example file; exampleIndexFile is the index
	// that ties verb senses to the numbered entries of that file.
	exampleFile      = "sents.vrb"
	exampleIndexFile = "sentidx.vrb"
)
    39  
    40  // ----- LEMMA INDEX PARSING --------------------------------------------------
    41  
    42  // Parses the index files.
    43  func parseIndexFiles(path string) (map[string][]string, error) {
    44  	result := map[string][]string{}
    45  
    46  	for _, file := range indexFiles {
    47  		// Read index file.
    48  		f, err := os.Open(filepath.Join(path, file))
    49  		if err != nil {
    50  			return nil, fmt.Errorf("%v: %v", file, err)
    51  		}
    52  		m, err := parseIndex(f)
    53  		f.Close()
    54  		if err != nil {
    55  			return nil, fmt.Errorf("%v: %v", file, err)
    56  		}
    57  
    58  		// Merge index with result.
    59  		for lemma := range m {
    60  			result[lemma] = m[lemma]
    61  		}
    62  	}
    63  
    64  	return result, nil
    65  }
    66  
    67  // Parses the contents of an index file.
    68  func parseIndex(r io.Reader) (map[string][]string, error) {
    69  	result := map[string][]string{}
    70  	scanner := bufio.NewScanner(r)
    71  
    72  	lineNum := 0
    73  	for scanner.Scan() {
    74  		lineNum++
    75  		if strings.HasPrefix(scanner.Text(), "  ") { // Copyright line.
    76  			continue
    77  		}
    78  
    79  		line, err := parseIndexLine(scanner.Text())
    80  		if err != nil {
    81  			return nil, fmt.Errorf("Line %d: %v", lineNum, err)
    82  		}
    83  
    84  		if len(line.synset) == 1 {
    85  			line.ranked = 1
    86  		}
    87  		for i := range line.synset {
    88  			line.synset[i] = line.pos + line.synset[i]
    89  		}
    90  		if line.ranked > 0 {
    91  			result[line.pos+"."+line.lemma] = line.synset[:line.ranked]
    92  		}
    93  	}
    94  
    95  	return result, nil
    96  }
    97  
// A single line in an index file, as returned by parseIndexLine.
type indexLine struct {
	lemma  string   // First field of the line.
	pos    string   // Second field of the line.
	ptr    []string // Pointer symbols listed on the line.
	synset []string // Synset fields at the end of the line.
	ranked int      // Tagsense count; never greater than len(synset).
}
   106  
   107  // Parses an index file line.
   108  func parseIndexLine(line string) (*indexLine, error) {
   109  	result := &indexLine{}
   110  	parts := strings.Split(strings.Trim(line, " "), " ")
   111  
   112  	if len(parts) < 7 {
   113  		return nil, fmt.Errorf("bad number of parts: %d, expected at least 7",
   114  			len(parts))
   115  	}
   116  
   117  	result.lemma = parts[0]
   118  	result.pos = parts[1]
   119  
   120  	synsetCount, err := parseDeciUint(parts[2])
   121  	if err != nil {
   122  		return nil, fmt.Errorf("bad synset count: %s", parts[2])
   123  	}
   124  	ptrCount, err := parseDeciUint(parts[3])
   125  	if err != nil {
   126  		return nil, fmt.Errorf("bad pointer count: %s", parts[3])
   127  	}
   128  
   129  	parts = parts[4:]
   130  	if len(parts) < ptrCount+2+synsetCount {
   131  		return nil, fmt.Errorf("bad number of parts: %d, expected %d",
   132  			len(parts)+4, ptrCount+synsetCount+6)
   133  	}
   134  
   135  	result.ptr = parts[:ptrCount]
   136  	parts = parts[ptrCount:]
   137  
   138  	result.ranked, err = parseDeciUint(parts[1])
   139  	if err != nil {
   140  		return nil, fmt.Errorf("Bad tagsense count: %s", parts[1])
   141  	}
   142  
   143  	result.synset = parts[2:]
   144  	if result.ranked > len(result.synset) {
   145  		return nil, fmt.Errorf("Bad tagsense-count: %d is greated than "+
   146  			"synset count %d.", result.ranked, len(result.synset))
   147  	}
   148  
   149  	return result, nil
   150  }
   151  
   152  // ----- VERB EXAMPLE PARSING -------------------------------------------------
   153  
   154  // Parses the verb example file.
   155  func parseExampleFile(path string) (map[string]string, error) {
   156  	f, err := os.Open(filepath.Join(path, exampleFile))
   157  	if err != nil {
   158  		return nil, fmt.Errorf("%s: %v", exampleFile, err)
   159  	}
   160  	defer f.Close()
   161  	return parseExamples(f)
   162  }
   163  
   164  // Parses a verb example file.
   165  func parseExamples(r io.Reader) (map[string]string, error) {
   166  	result := map[string]string{}
   167  	scanner := bufio.NewScanner(r)
   168  
   169  	lineNum := 0
   170  	for scanner.Scan() {
   171  		lineNum++
   172  		parts := strings.Split(scanner.Text(), " ")
   173  		if len(parts) == 0 {
   174  			return nil, fmt.Errorf("line %d: No data to parse", lineNum)
   175  		}
   176  		_, err := parseDeciUint(parts[0])
   177  		if err != nil {
   178  			return nil, fmt.Errorf("line %d: %v", lineNum, err)
   179  		}
   180  		result[parts[0]] = strings.Join(parts[1:], " ")
   181  	}
   182  
   183  	return result, nil
   184  }
   185  
   186  // Parses the verb example index file.
   187  func parseExampleIndexFile(path string) (map[string][]int, error) {
   188  	f, err := os.Open(filepath.Join(path, exampleIndexFile))
   189  	if err != nil {
   190  		return nil, fmt.Errorf("%s: %v", exampleIndexFile, err)
   191  	}
   192  	defer f.Close()
   193  	return parseExampleIndex(f)
   194  }
   195  
   196  // Parses an entire verb example index file.
   197  func parseExampleIndex(r io.Reader) (map[string][]int, error) {
   198  	result := map[string][]int{}
   199  	scanner := bufio.NewScanner(r)
   200  
   201  	lineNum := 0
   202  	for scanner.Scan() {
   203  		lineNum++
   204  		raw, err := parseExampleIndexLine(scanner.Text())
   205  		if err != nil {
   206  			return nil, fmt.Errorf("line %d: %v", lineNum, err)
   207  		}
   208  		key := fmt.Sprintf("%s.%d.%d", raw.lemma, raw.lexFileNum, raw.lexId)
   209  		result[key] = raw.exampleIds
   210  	}
   211  
   212  	if scanner.Err() != nil {
   213  		return nil, scanner.Err()
   214  	}
   215  
   216  	return result, nil
   217  }
   218  
// Represents a single line in the verb example index file. A line holds a
// sense key of the form "lemma%pos:lexFileNum:lexId:headWord:headId",
// a space, and a comma-separated list of example IDs.
type rawExampleIndex struct {
	lemma      string // Part of the sense key before the '%'.
	pos        int    // 1st ':'-separated field after the '%'.
	lexFileNum int    // 2nd ':'-separated field after the '%'.
	lexId      int    // 3rd ':'-separated field after the '%'.
	headWord   string // 4th ':'-separated field after the '%'; may be empty.
	headId     int    // 5th field; parsed only when headWord is not empty.
	exampleIds []int  // Numbers listed after the space; nil when there are none.
}
   229  
   230  // Parses a single line in the lemma-example index file.
   231  func parseExampleIndexLine(line string) (*rawExampleIndex, error) {
   232  	result := &rawExampleIndex{}
   233  	parts := strings.Split(line, " ")
   234  	if len(parts) != 2 {
   235  		return nil, fmt.Errorf("bad number of parts: %d, expected 2",
   236  			len(parts))
   237  	}
   238  
   239  	// Parse sense.
   240  	senseParts := strings.Split(parts[0], "%")
   241  	if len(senseParts) != 2 {
   242  		return nil, fmt.Errorf("bad number of sense-key parts: %d, expected"+
   243  			" 2", len(senseParts))
   244  	}
   245  
   246  	result.lemma = senseParts[0]
   247  	lexSenseParts := strings.Split(senseParts[1], ":")
   248  	if len(lexSenseParts) != 5 {
   249  		return nil, fmt.Errorf("bad number of lex-sense parts: %d, expected"+
   250  			" 5", len(lexSenseParts))
   251  	}
   252  
   253  	// Parse lex-sense.
   254  	var err error
   255  	result.pos, err = parseDeciUint(lexSenseParts[0])
   256  	if err != nil {
   257  		return nil, err
   258  	}
   259  	result.lexFileNum, err = parseDeciUint(lexSenseParts[1])
   260  	if err != nil {
   261  		return nil, err
   262  	}
   263  	result.lexId, err = parseDeciUint(lexSenseParts[2])
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	result.headWord = lexSenseParts[3]
   268  	if result.headWord != "" {
   269  		result.headId, err = parseDeciUint(lexSenseParts[4])
   270  		if err != nil {
   271  			return nil, err
   272  		}
   273  	}
   274  
   275  	// Parse example numbers.
   276  	if parts[1] != "" {
   277  		numParts := strings.Split(parts[1], ",")
   278  		nums := make([]int, len(numParts))
   279  		for i := range numParts {
   280  			nums[i], err = parseDeciUint(numParts[i])
   281  			if err != nil {
   282  				return nil, err
   283  			}
   284  		}
   285  		result.exampleIds = nums
   286  	}
   287  
   288  	return result, nil
   289  }
   290  
   291  // ----- EXCEPTION PARSING ----------------------------------------------------
   292  
   293  func parseExceptionFiles(path string) (map[string][]string, error) {
   294  	result := map[string][]string{}
   295  	for file, pos := range exceptionFiles {
   296  		f, err := os.Open(filepath.Join(path, file))
   297  		if err != nil {
   298  			return nil, fmt.Errorf("%s: %v", file, err)
   299  		}
   300  		err = parseExceptionFile(f, pos, result)
   301  		f.Close()
   302  		if err != nil {
   303  			return nil, fmt.Errorf("%s: %v", file, err)
   304  		}
   305  	}
   306  	return result, nil
   307  }
   308  
   309  // Parses a single exception file. Adds keys to out that point to already
   310  // existing values.
   311  func parseExceptionFile(in io.Reader, pos string, out map[string][]string,
   312  ) error {
   313  	scanner := bufio.NewScanner(in)
   314  
   315  	// For each line.
   316  	lineNum := 0
   317  	for scanner.Scan() {
   318  		lineNum++
   319  		line := scanner.Text()
   320  		parts := strings.Split(line, " ")
   321  		if len(parts) < 2 {
   322  			return fmt.Errorf("line %d: Bad number of fields: %d, expected 2",
   323  				lineNum, len(parts))
   324  		}
   325  
   326  		for i := range parts {
   327  			parts[i] = pos + "." + parts[i]
   328  		}
   329  		out[parts[0]] = parts[1:]
   330  	}
   331  
   332  	return scanner.Err()
   333  }
   334  
   335  // ----- DATA PARSING ---------------------------------------------------------
   336  
   337  // Parses all the data files and returns the 'Synset' field for the Wordnet
   338  // object. Path is data root directory. Example is a map from word sense to
   339  // example IDs.
   340  func parseDataFiles(path string, examples map[string][]int) (
   341  	map[string]*Synset, error) {
   342  	result := map[string]*Synset{}
   343  	for file, pos := range dataFiles {
   344  		f, err := os.Open(filepath.Join(path, file))
   345  		if err != nil {
   346  			return nil, fmt.Errorf("%s: %v", file, err)
   347  		}
   348  		err = parseDataFile(f, pos, examples, result)
   349  		f.Close()
   350  		if err != nil {
   351  			return nil, fmt.Errorf("%s: %v", file, err)
   352  		}
   353  	}
   354  	return result, nil
   355  }
   356  
   357  // Parses a single data file. Path is the data file. Pos is the POS that this
   358  // file represents. Example is a map from word sense to example IDs. Updates
   359  // out with parsed data.
   360  func parseDataFile(in io.Reader, pos string, examples map[string][]int,
   361  	out map[string]*Synset) error {
   362  	scanner := bufio.NewScanner(in)
   363  
   364  	// For each line.
   365  	lineNum := 0
   366  	for scanner.Scan() {
   367  		line := scanner.Text()
   368  		lineNum++
   369  		if strings.HasPrefix(line, "  ") { // Copyright line.
   370  			continue
   371  		}
   372  
   373  		// Parse.
   374  		raw, err := parseDataLine(line, pos == "v")
   375  		if err != nil {
   376  			return fmt.Errorf("Line %d: %v", lineNum, err)
   377  		}
   378  
   379  		// Assign.
   380  		nice := rawSynsetToNiceSynset(raw)
   381  		key := fmt.Sprintf("%v%v", pos, raw.synsetOffset)
   382  		out[key] = nice
   383  
   384  		// Handle examples.
   385  		for i, word := range raw.word {
   386  			key := fmt.Sprintf("%s.%d.%d", word.word, raw.lexFileNum,
   387  				word.lexId)
   388  			//fmt.Println(key)
   389  			for _, exampleId := range examples[key] {
   390  				nice.Example = append(nice.Example, &Example{i, exampleId})
   391  			}
   392  		}
   393  	}
   394  
   395  	return scanner.Err()
   396  }
   397  
   398  // Converts a raw parsed synset to the exported type.
   399  func rawSynsetToNiceSynset(raw *rawSynset) *Synset {
   400  	result := &Synset{
   401  		raw.synsetOffset,
   402  		raw.ssType,
   403  		make([]string, len(raw.word)),
   404  		make([]*Pointer, len(raw.ptr)),
   405  		raw.frame,
   406  		raw.gloss,
   407  		nil,
   408  	}
   409  	for _, frame := range result.Frame {
   410  		frame.WordNumber-- // Switch from 1-based to 0-based.
   411  	}
   412  	for i, word := range raw.word {
   413  		result.Word[i] = word.word
   414  	}
   415  	for i, rawPtr := range raw.ptr {
   416  		result.Pointer[i] = &Pointer{
   417  			rawPtr.symbol,
   418  			fmt.Sprintf("%v%v", rawPtr.pos, rawPtr.synsetOffset),
   419  			rawPtr.source - 1, // Switch from 1-based to 0-based.
   420  			rawPtr.target - 1, // Switch from 1-based to 0-based.
   421  		}
   422  	}
   423  
   424  	return result
   425  }
   426  
// Represents a single line in a data file, as returned by parseDataLine.
type rawSynset struct {
	synsetOffset string        // 1st field of the line, kept as text.
	lexFileNum   int           // 2nd field of the line (decimal).
	ssType       string        // 3rd field of the line; a key of ssTypes.
	word         []*rawWord    // Words of the synset, in file order.
	ptr          []*rawPointer // Pointers of the synset, in file order.
	frame        []*Frame      // Frames; nil unless parsed with hasFrames.
	gloss        string        // Text that follows the "|" field.
}
   437  
// Represents a single pointer of a data file line. A pointer takes up four
// fields: symbol, synset offset, POS and a 4-character source/target field.
type rawPointer struct {
	symbol       string // 1st pointer field.
	synsetOffset string // 2nd pointer field, kept as text.
	pos          string // 3rd pointer field.
	source       int    // First 2 hex digits of the 4th field. 1-based.
	target       int    // Last 2 hex digits of the 4th field. 1-based.
}
   445  
// Represents a single word of a data file line. A word takes up two fields.
type rawWord struct {
	word  string // Word text, exactly as written in the file.
	lexId int    // Field that follows the word; hexadecimal in the file.
}
   450  
// Accepted synset types. parseDataLine rejects every line whose ss_type field
// is not one of these keys.
var ssTypes = map[string]bool{
	"n": true,
	"v": true,
	"a": true,
	"s": true,
	"r": true,
}
   459  
   460  // TODO(amit): Convert underscores in words to spaces.
   461  
   462  // Parses a single line in a data file. hasFrames is true only for the verb
   463  // file.
   464  func parseDataLine(line string, hasFrames bool) (*rawSynset, error) {
   465  	result := &rawSynset{}
   466  	var err error
   467  	parts := strings.Split(strings.Trim(line, " "), " ")
   468  	if len(parts) < 6 {
   469  		return nil, fmt.Errorf("too few fields: %d, expected at "+
   470  			"least 6", len(parts))
   471  	}
   472  
   473  	// Parse beginning of line.
   474  	result.synsetOffset = parts[0]
   475  	result.lexFileNum, err = parseDeciUint(parts[1])
   476  	if err != nil {
   477  		return nil, err
   478  	}
   479  
   480  	if !ssTypes[parts[2]] {
   481  		return nil, fmt.Errorf("unrecognized ss_type: %s", parts[2])
   482  	}
   483  	result.ssType = parts[2]
   484  
   485  	// Parse words.
   486  	wordCount, err := parseHexaUint(parts[3])
   487  	if err != nil {
   488  		return nil, err
   489  	}
   490  	parts = parts[4:]
   491  	if len(parts) < 2*wordCount+2 {
   492  		return nil, fmt.Errorf("too few fields for words: %d, expected at "+
   493  			"least %d", len(parts), 2*wordCount+2)
   494  	}
   495  	result.word = make([]*rawWord, wordCount)
   496  
   497  	for i := 0; i < wordCount; i++ {
   498  		word := &rawWord{}
   499  		word.word = parts[0]
   500  		lexId, err := parseHexaUint(parts[1])
   501  		if err != nil {
   502  			return nil, err
   503  		}
   504  		word.lexId = lexId
   505  		result.word[i] = word
   506  		parts = parts[2:]
   507  	}
   508  
   509  	// Parse pointers.
   510  	ptrCount, err := parseDeciUint(parts[0])
   511  	if err != nil {
   512  		return nil, err
   513  	}
   514  	parts = parts[1:]
   515  	if len(parts) < 4*ptrCount+1 {
   516  		return nil, fmt.Errorf("too few fields for pointers: %d, expected "+
   517  			"at least %d", len(parts), 4*ptrCount+1)
   518  	}
   519  	result.ptr = make([]*rawPointer, ptrCount)
   520  
   521  	for i := 0; i < ptrCount; i++ {
   522  		ptr := &rawPointer{}
   523  		ptr.symbol = parts[0]
   524  		ptr.synsetOffset = parts[1]
   525  		ptr.pos = parts[2]
   526  
   527  		if len(parts[3]) != 4 {
   528  			return nil, fmt.Errorf("bad pointer source/target field: %s",
   529  				parts[3])
   530  		}
   531  		ptr.source, err = parseHexaUint(parts[3][:2])
   532  		if err != nil {
   533  			return nil, err
   534  		}
   535  		ptr.target, err = parseHexaUint(parts[3][2:])
   536  		if err != nil {
   537  			return nil, err
   538  		}
   539  		result.ptr[i] = ptr
   540  
   541  		parts = parts[4:]
   542  	}
   543  
   544  	// Parse frames.
   545  	if hasFrames {
   546  		frameCount, err := parseDeciUint(parts[0])
   547  		if err != nil {
   548  			return nil, err
   549  		}
   550  		parts = parts[1:]
   551  		if len(parts) < 3*frameCount+1 {
   552  			return nil, fmt.Errorf("too few fields for frames: %d, expected "+
   553  				"at least %d", len(parts), 3*frameCount+1)
   554  		}
   555  
   556  		result.frame = make([]*Frame, frameCount)
   557  		for i := range result.frame {
   558  			f, err := parseDeciUint(parts[1])
   559  			if err != nil {
   560  				return nil, err
   561  			}
   562  			w, err := parseHexaUint(parts[2])
   563  			if err != nil {
   564  				return nil, err
   565  			}
   566  			result.frame[i] = &Frame{w, f}
   567  			parts = parts[3:]
   568  		}
   569  	}
   570  
   571  	// Parse glossary.
   572  	if parts[0] != "|" {
   573  		return nil, fmt.Errorf("expected '|' at end of fields, but found "+
   574  			"'%s'", parts[0])
   575  	}
   576  	result.gloss = strings.Join(parts[1:], " ")
   577  
   578  	return result, nil
   579  }
   580  
   581  // ----- UTILS ----------------------------------------------------------------
   582  
   583  // Now what in the world were they thinking when they put hexa and decimal in
   584  // the same format? Academics and code. -_-
   585  
   586  func parseHexaUint(s string) (int, error) {
   587  	i, err := strconv.ParseUint(s, 16, 0)
   588  	return int(i), err
   589  }
   590  
   591  func parseDeciUint(s string) (int, error) {
   592  	i, err := strconv.ParseUint(s, 10, 0)
   593  	return int(i), err
   594  }
   595  
// Pointer symbol meanings. Each constant holds the symbol text that stands
// for the relation it is named after. Pertainym and DerivedFromAdjective have
// the same value, a single backslash, so the two cannot be told apart by
// symbol alone.
// NOTE(review): presumably these are meant to be compared with the symbol of
// a parsed pointer - confirm against the documentation of the Pointer type.
const (
	Antonym                   = "!"
	Hypernym                  = "@"
	InstanceHypernym          = "@i"
	Hyponym                   = "~"
	InstanceHyponym           = "~i"
	MemberHolonym             = "#m"
	SubstanceHolonym          = "#s"
	PartHolonym               = "#p"
	MemberMeronym             = "%m"
	SubstanceMeronym          = "%s"
	PartMeronym               = "%p"
	Attribute                 = "="
	DerivationallyRelatedForm = "+"
	DomainOfSynsetTopic       = ";c"
	MemberOfThisDomainTopic   = "-c"
	DomainOfSynsetRegion      = ";r"
	MemberOfThisDomainRegion  = "-r"
	DomainOfSynsetUsage       = ";u"
	MemberOfThisDomainUsage   = "-u"
	Entailment                = "*"
	Cause                     = ">"
	AlsoSee                   = "^"
	VerbGroup                 = "$"
	SimilarTo                 = "&"
	ParticipleOfVerb          = "<"
	Pertainym                 = "\\"
	DerivedFromAdjective      = "\\"
)