github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/nlp/wordnet/parser.go

package wordnet

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// TODO(amit): Convenience functions for pointers?

// ----- FILE LISTS -----------------------------------------------------------

var (
	dataFiles = map[string]string{
		"data.adj":  "a",
		"data.adv":  "r",
		"data.noun": "n",
		"data.verb": "v",
	}
	exceptionFiles = map[string]string{
		"adj.exc":  "a",
		"adv.exc":  "r",
		"noun.exc": "n",
		"verb.exc": "v",
	}
	indexFiles = []string{
		"index.adj",
		"index.adv",
		"index.noun",
		"index.verb",
	}
	exampleFile      = "sents.vrb"
	exampleIndexFile = "sentidx.vrb"
)

// ----- LEMMA INDEX PARSING --------------------------------------------------

// Parses the index files.
func parseIndexFiles(path string) (map[string][]string, error) {
	result := map[string][]string{}

	for _, file := range indexFiles {
		// Read index file.
		f, err := os.Open(filepath.Join(path, file))
		if err != nil {
			return nil, fmt.Errorf("%v: %v", file, err)
		}
		m, err := parseIndex(f)
		f.Close()
		if err != nil {
			return nil, fmt.Errorf("%v: %v", file, err)
		}

		// Merge index with result.
		for lemma := range m {
			result[lemma] = m[lemma]
		}
	}

	return result, nil
}

// Parses the contents of an index file.
func parseIndex(r io.Reader) (map[string][]string, error) {
	result := map[string][]string{}
	scanner := bufio.NewScanner(r)

	lineNum := 0
	for scanner.Scan() {
		lineNum++
		if strings.HasPrefix(scanner.Text(), " ") { // Copyright line.
			continue
		}

		line, err := parseIndexLine(scanner.Text())
		if err != nil {
			return nil, fmt.Errorf("line %d: %v", lineNum, err)
		}

		if len(line.synset) == 1 {
			line.ranked = 1
		}
		for i := range line.synset {
			line.synset[i] = line.pos + line.synset[i]
		}
		if line.ranked > 0 {
			result[line.pos+"."+line.lemma] = line.synset[:line.ranked]
		}
	}

	return result, scanner.Err()
}

// A single line in an index file.
type indexLine struct {
	lemma  string
	pos    string
	ptr    []string
	synset []string
	ranked int
}
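
// A hedged usage sketch, not part of the original file: feeding one schematic
// index line through parseIndexLine. The field layout follows the WordNet
// wndb(5WN) index-file description; the lemma and synset offsets below are
// made up for illustration.
func exampleParseIndexLine() {
	parsed, err := parseIndexLine("dog n 2 3 @ ~ #m 2 1 00000001 00000002")
	if err != nil {
		fmt.Println(err)
		return
	}
	// Prints the lemma, how many of its synsets are ranked by tag-sense
	// count, and the synset offsets themselves.
	fmt.Println(parsed.lemma, parsed.ranked, parsed.synset) // dog 1 [00000001 00000002]
}
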
// Parses an index file line.
func parseIndexLine(line string) (*indexLine, error) {
	result := &indexLine{}
	parts := strings.Split(strings.Trim(line, " "), " ")

	if len(parts) < 7 {
		return nil, fmt.Errorf("bad number of parts: %d, expected at least 7",
			len(parts))
	}

	result.lemma = parts[0]
	result.pos = parts[1]

	synsetCount, err := parseDeciUint(parts[2])
	if err != nil {
		return nil, fmt.Errorf("bad synset count: %s", parts[2])
	}
	ptrCount, err := parseDeciUint(parts[3])
	if err != nil {
		return nil, fmt.Errorf("bad pointer count: %s", parts[3])
	}

	parts = parts[4:]
	if len(parts) < ptrCount+2+synsetCount {
		return nil, fmt.Errorf("bad number of parts: %d, expected %d",
			len(parts)+4, ptrCount+synsetCount+6)
	}

	result.ptr = parts[:ptrCount]
	parts = parts[ptrCount:]

	result.ranked, err = parseDeciUint(parts[1])
	if err != nil {
		return nil, fmt.Errorf("bad tagsense count: %s", parts[1])
	}

	result.synset = parts[2:]
	if result.ranked > len(result.synset) {
		return nil, fmt.Errorf("bad tagsense count: %d is greater than "+
			"synset count %d", result.ranked, len(result.synset))
	}

	return result, nil
}

// ----- VERB EXAMPLE PARSING -------------------------------------------------

// Parses the verb example file.
func parseExampleFile(path string) (map[string]string, error) {
	f, err := os.Open(filepath.Join(path, exampleFile))
	if err != nil {
		return nil, fmt.Errorf("%s: %v", exampleFile, err)
	}
	defer f.Close()
	return parseExamples(f)
}

// Parses a verb example file.
func parseExamples(r io.Reader) (map[string]string, error) {
	result := map[string]string{}
	scanner := bufio.NewScanner(r)

	lineNum := 0
	for scanner.Scan() {
		lineNum++
		parts := strings.Split(scanner.Text(), " ")
		if len(parts) == 0 {
			return nil, fmt.Errorf("line %d: no data to parse", lineNum)
		}
		_, err := parseDeciUint(parts[0])
		if err != nil {
			return nil, fmt.Errorf("line %d: %v", lineNum, err)
		}
		result[parts[0]] = strings.Join(parts[1:], " ")
	}

	return result, scanner.Err()
}

// Parses the verb example index file.
func parseExampleIndexFile(path string) (map[string][]int, error) {
	f, err := os.Open(filepath.Join(path, exampleIndexFile))
	if err != nil {
		return nil, fmt.Errorf("%s: %v", exampleIndexFile, err)
	}
	defer f.Close()
	return parseExampleIndex(f)
}

// Parses an entire verb example index file.
func parseExampleIndex(r io.Reader) (map[string][]int, error) {
	result := map[string][]int{}
	scanner := bufio.NewScanner(r)

	lineNum := 0
	for scanner.Scan() {
		lineNum++
		raw, err := parseExampleIndexLine(scanner.Text())
		if err != nil {
			return nil, fmt.Errorf("line %d: %v", lineNum, err)
		}
		key := fmt.Sprintf("%s.%d.%d", raw.lemma, raw.lexFileNum, raw.lexId)
		result[key] = raw.exampleIds
	}

	if scanner.Err() != nil {
		return nil, scanner.Err()
	}

	return result, nil
}

// Represents a single line in the verb example index file.
type rawExampleIndex struct {
	lemma      string
	pos        int
	lexFileNum int
	lexId      int
	headWord   string
	headId     int
	exampleIds []int
}
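
// A hedged usage sketch, not part of the original file: parsing one schematic
// sentidx.vrb line. The sense key follows the
// lemma%ss_type:lex_filenum:lex_id:head_word:head_id layout that
// parseExampleIndexLine expects; the numbers here are made up.
func exampleParseExampleIndexLine() {
	raw, err := parseExampleIndexLine("resign%2:41:01:: 4,9")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(raw.lemma, raw.exampleIds) // resign [4 9]
}
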
// Parses a single line in the lemma-example index file.
func parseExampleIndexLine(line string) (*rawExampleIndex, error) {
	result := &rawExampleIndex{}
	parts := strings.Split(line, " ")
	if len(parts) != 2 {
		return nil, fmt.Errorf("bad number of parts: %d, expected 2",
			len(parts))
	}

	// Parse sense.
	senseParts := strings.Split(parts[0], "%")
	if len(senseParts) != 2 {
		return nil, fmt.Errorf("bad number of sense-key parts: %d, expected"+
			" 2", len(senseParts))
	}

	result.lemma = senseParts[0]
	lexSenseParts := strings.Split(senseParts[1], ":")
	if len(lexSenseParts) != 5 {
		return nil, fmt.Errorf("bad number of lex-sense parts: %d, expected"+
			" 5", len(lexSenseParts))
	}

	// Parse lex-sense.
	var err error
	result.pos, err = parseDeciUint(lexSenseParts[0])
	if err != nil {
		return nil, err
	}
	result.lexFileNum, err = parseDeciUint(lexSenseParts[1])
	if err != nil {
		return nil, err
	}
	result.lexId, err = parseDeciUint(lexSenseParts[2])
	if err != nil {
		return nil, err
	}
	result.headWord = lexSenseParts[3]
	if result.headWord != "" {
		result.headId, err = parseDeciUint(lexSenseParts[4])
		if err != nil {
			return nil, err
		}
	}

	// Parse example numbers.
	if parts[1] != "" {
		numParts := strings.Split(parts[1], ",")
		nums := make([]int, len(numParts))
		for i := range numParts {
			nums[i], err = parseDeciUint(numParts[i])
			if err != nil {
				return nil, err
			}
		}
		result.exampleIds = nums
	}

	return result, nil
}

// ----- EXCEPTION PARSING ----------------------------------------------------

// Parses the exception files.
func parseExceptionFiles(path string) (map[string][]string, error) {
	result := map[string][]string{}
	for file, pos := range exceptionFiles {
		f, err := os.Open(filepath.Join(path, file))
		if err != nil {
			return nil, fmt.Errorf("%s: %v", file, err)
		}
		err = parseExceptionFile(f, pos, result)
		f.Close()
		if err != nil {
			return nil, fmt.Errorf("%s: %v", file, err)
		}
	}
	return result, nil
}

// Parses a single exception file. Adds keys to out that point to already
// existing values.
func parseExceptionFile(in io.Reader, pos string, out map[string][]string,
) error {
	scanner := bufio.NewScanner(in)

	// For each line.
	lineNum := 0
	for scanner.Scan() {
		lineNum++
		line := scanner.Text()
		parts := strings.Split(line, " ")
		if len(parts) < 2 {
			return fmt.Errorf("line %d: bad number of fields: %d, expected "+
				"at least 2", lineNum, len(parts))
		}

		for i := range parts {
			parts[i] = pos + "." + parts[i]
		}
		out[parts[0]] = parts[1:]
	}

	return scanner.Err()
}

// ----- DATA PARSING ---------------------------------------------------------

// Parses all the data files and returns the 'Synset' field for the Wordnet
// object. Path is the data root directory. Examples is a map from word sense
// to example IDs.
func parseDataFiles(path string, examples map[string][]int) (
	map[string]*Synset, error) {
	result := map[string]*Synset{}
	for file, pos := range dataFiles {
		f, err := os.Open(filepath.Join(path, file))
		if err != nil {
			return nil, fmt.Errorf("%s: %v", file, err)
		}
		err = parseDataFile(f, pos, examples, result)
		f.Close()
		if err != nil {
			return nil, fmt.Errorf("%s: %v", file, err)
		}
	}
	return result, nil
}
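
// A hedged usage sketch, not part of the original file: wiring the verb
// example index into the data parser. "dict" stands in for a hypothetical
// path to a WordNet dict directory.
func exampleParseData() (map[string]*Synset, error) {
	examples, err := parseExampleIndexFile("dict")
	if err != nil {
		return nil, err
	}
	return parseDataFiles("dict", examples)
}
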
// Parses a single data file. In is the data file's contents. Pos is the POS
// that this file represents. Examples is a map from word sense to example
// IDs. Updates out with parsed data.
func parseDataFile(in io.Reader, pos string, examples map[string][]int,
	out map[string]*Synset) error {
	scanner := bufio.NewScanner(in)

	// For each line.
	lineNum := 0
	for scanner.Scan() {
		line := scanner.Text()
		lineNum++
		if strings.HasPrefix(line, " ") { // Copyright line.
			continue
		}

		// Parse.
		raw, err := parseDataLine(line, pos == "v")
		if err != nil {
			return fmt.Errorf("line %d: %v", lineNum, err)
		}

		// Assign.
		nice := rawSynsetToNiceSynset(raw)
		key := fmt.Sprintf("%v%v", pos, raw.synsetOffset)
		out[key] = nice

		// Handle examples.
		for i, word := range raw.word {
			key := fmt.Sprintf("%s.%d.%d", word.word, raw.lexFileNum,
				word.lexId)
			for _, exampleId := range examples[key] {
				nice.Example = append(nice.Example, &Example{i, exampleId})
			}
		}
	}

	return scanner.Err()
}

// Converts a raw parsed synset to the exported type.
func rawSynsetToNiceSynset(raw *rawSynset) *Synset {
	result := &Synset{
		raw.synsetOffset,
		raw.ssType,
		make([]string, len(raw.word)),
		make([]*Pointer, len(raw.ptr)),
		raw.frame,
		raw.gloss,
		nil,
	}
	for _, frame := range result.Frame {
		frame.WordNumber-- // Switch from 1-based to 0-based.
	}
	for i, word := range raw.word {
		result.Word[i] = word.word
	}
	for i, rawPtr := range raw.ptr {
		result.Pointer[i] = &Pointer{
			rawPtr.symbol,
			fmt.Sprintf("%v%v", rawPtr.pos, rawPtr.synsetOffset),
			rawPtr.source - 1, // Switch from 1-based to 0-based.
			rawPtr.target - 1, // Switch from 1-based to 0-based.
		}
	}

	return result
}

// Represents a single line in a data file.
type rawSynset struct {
	synsetOffset string
	lexFileNum   int
	ssType       string
	word         []*rawWord
	ptr          []*rawPointer
	frame        []*Frame
	gloss        string
}

type rawPointer struct {
	symbol       string
	synsetOffset string
	pos          string
	source       int // 1-based.
	target       int // 1-based.
}

type rawWord struct {
	word  string
	lexId int
}

// Accepted synset types.
var ssTypes = map[string]bool{
	"n": true,
	"v": true,
	"a": true,
	"s": true,
	"r": true,
}

// TODO(amit): Convert underscores in words to spaces.

// Parses a single line in a data file. hasFrames is true only for the verb
// file.
func parseDataLine(line string, hasFrames bool) (*rawSynset, error) {
	result := &rawSynset{}
	var err error
	parts := strings.Split(strings.Trim(line, " "), " ")
	if len(parts) < 6 {
		return nil, fmt.Errorf("too few fields: %d, expected at "+
			"least 6", len(parts))
	}

	// Parse beginning of line.
	result.synsetOffset = parts[0]
	result.lexFileNum, err = parseDeciUint(parts[1])
	if err != nil {
		return nil, err
	}

	if !ssTypes[parts[2]] {
		return nil, fmt.Errorf("unrecognized ss_type: %s", parts[2])
	}
	result.ssType = parts[2]

	// Parse words.
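	// The word count is two hexadecimal digits, and each word is followed by
	// its one-hex-digit lex_id, so each word contributes two fields (per the
	// wndb(5WN) data-file description).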
	wordCount, err := parseHexaUint(parts[3])
	if err != nil {
		return nil, err
	}
	parts = parts[4:]
	if len(parts) < 2*wordCount+2 {
		return nil, fmt.Errorf("too few fields for words: %d, expected at "+
			"least %d", len(parts), 2*wordCount+2)
	}
	result.word = make([]*rawWord, wordCount)

	for i := 0; i < wordCount; i++ {
		word := &rawWord{}
		word.word = parts[0]
		lexId, err := parseHexaUint(parts[1])
		if err != nil {
			return nil, err
		}
		word.lexId = lexId
		result.word[i] = word
		parts = parts[2:]
	}

	// Parse pointers.
	ptrCount, err := parseDeciUint(parts[0])
	if err != nil {
		return nil, err
	}
	parts = parts[1:]
	if len(parts) < 4*ptrCount+1 {
		return nil, fmt.Errorf("too few fields for pointers: %d, expected "+
			"at least %d", len(parts), 4*ptrCount+1)
	}
	result.ptr = make([]*rawPointer, ptrCount)

	for i := 0; i < ptrCount; i++ {
		ptr := &rawPointer{}
		ptr.symbol = parts[0]
		ptr.synsetOffset = parts[1]
		ptr.pos = parts[2]

		// Source and target are packed into one 4-digit hex field: the first
		// two digits are the source word number, the last two the target's.
		if len(parts[3]) != 4 {
			return nil, fmt.Errorf("bad pointer source/target field: %s",
				parts[3])
		}
		ptr.source, err = parseHexaUint(parts[3][:2])
		if err != nil {
			return nil, err
		}
		ptr.target, err = parseHexaUint(parts[3][2:])
		if err != nil {
			return nil, err
		}
		result.ptr[i] = ptr

		parts = parts[4:]
	}

	// Parse frames.
	if hasFrames {
		frameCount, err := parseDeciUint(parts[0])
		if err != nil {
			return nil, err
		}
		parts = parts[1:]
		if len(parts) < 3*frameCount+1 {
			return nil, fmt.Errorf("too few fields for frames: %d, expected "+
				"at least %d", len(parts), 3*frameCount+1)
		}

		result.frame = make([]*Frame, frameCount)
		for i := range result.frame {
			f, err := parseDeciUint(parts[1])
			if err != nil {
				return nil, err
			}
			w, err := parseHexaUint(parts[2])
			if err != nil {
				return nil, err
			}
			result.frame[i] = &Frame{w, f}
			parts = parts[3:]
		}
	}

	// Parse glossary.
	if parts[0] != "|" {
		return nil, fmt.Errorf("expected '|' at end of fields, but found "+
			"'%s'", parts[0])
	}
	result.gloss = strings.Join(parts[1:], " ")

	return result, nil
}

// ----- UTILS ----------------------------------------------------------------

// Now what in the world were they thinking when they put hexa and decimal in
// the same format? Academics and code. -_-

func parseHexaUint(s string) (int, error) {
	i, err := strconv.ParseUint(s, 16, 0)
	return int(i), err
}

func parseDeciUint(s string) (int, error) {
	i, err := strconv.ParseUint(s, 10, 0)
	return int(i), err
}

// Pointer symbol meanings.
const (
	Antonym                   = "!"
	Hypernym                  = "@"
	InstanceHypernym          = "@i"
	Hyponym                   = "~"
	InstanceHyponym           = "~i"
	MemberHolonym             = "#m"
	SubstanceHolonym          = "#s"
	PartHolonym               = "#p"
	MemberMeronym             = "%m"
	SubstanceMeronym          = "%s"
	PartMeronym               = "%p"
	Attribute                 = "="
	DerivationallyRelatedForm = "+"
	DomainOfSynsetTopic       = ";c"
	MemberOfThisDomainTopic   = "-c"
	DomainOfSynsetRegion      = ";r"
	MemberOfThisDomainRegion  = "-r"
	DomainOfSynsetUsage       = ";u"
	MemberOfThisDomainUsage   = "-u"
	Entailment                = "*"
	Cause                     = ">"
	AlsoSee                   = "^"
	VerbGroup                 = "$"
	SimilarTo                 = "&"
	ParticipleOfVerb          = "<"
	Pertainym                 = "\\"
	DerivedFromAdjective      = "\\"
)
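
// A hedged usage sketch, not part of the original file: running one schematic
// verb data line through parseDataLine. The layout follows wndb(5WN); the
// offsets, counts, and gloss below are made up for illustration.
func exampleParseDataLine() {
	line := "00001740 29 v 01 breathe 0 002 @ 00001234 v 0000 ~ 00005678 v 0000 " +
		"01 + 02 00 | draw air into and expel out of the lungs"
	raw, err := parseDataLine(line, true)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(raw.word[0].word, "-", raw.gloss)
	// breathe - draw air into and expel out of the lungs
}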