github.com/geniusesgroup/libgo@v0.0.0-20220713101832-828057a9d3d4/matn/text-index-set.go

github.com/geniusesgroup/libgo@v0.0.0-20220713101832-828057a9d3d4/matn/text-index-set.go (about)

     1  /* For license and copyright information please see LEGAL file in repository */
     2  
     3  package matn
     4  
     5  import (
     6  	er "../error"
     7  )
     8  
     9  // TextIndexSetReq is request structure of TextIndex()
    10  type TextIndexSetReq struct {
    11  	RecordID           [32]byte
    12  	RecordStructure    uint64
    13  	RecordPrimaryKey   [32]byte // Store any primary ID or any data up to 32 byte length e.g. ID
    14  	RecordSecondaryKey [32]byte // Store any secondary ID or any data up to 32 byte length e.g. GroupID
    15  	RecordOwnerID      [32]byte
    16  	RecordFieldID      uint8
    17  	Text               string
    18  }
    19  
    20  // TextIndexSet index given text
    21  func TextIndexSet(req *TextIndexSetReq) (err protocol.Error) {
    22  	var indexes = WordTokenization(req)
    23  
    24  	for _, index := range indexes {
    25  		err = index.SaveOrUpdate()
    26  	}
    27  
    28  	// TODO::: need to index or do anything here about phrases???
    29  
    30  	return
    31  }
    32  
    33  // WordTokenization uses the delimiters categorized under [Dash, Hyphen, Pattern_Syntax, Quotation_Mark, Terminal_Punctuation, White_Space]
    34  // https://github.com/jdkato/prose
    35  func WordTokenization(req *TextIndexSetReq) (indexes map[string]*IndexWord) {
    36  	// var fields = strings.Fields(req.Text)
    37  	// fmt.Println(fields)
    38  
    39  	indexes = map[string]*IndexWord{}
    40  
    41  	var (
    42  		ok                bool
    43  		index             *IndexWord
    44  		word              string
    45  		wordStart         bool
    46  		sentenceEnd       bool
    47  		lastSentenceIndex int
    48  
    49  		WordOffsetInSentence uint64 //  Position of the word in the sentence
    50  		WordOffsetInText     uint64 //  Position of the word in the text
    51  		OffsetInSentence     uint64 //  First word charecter possition in the sentence
    52  		OffsetInText         uint64 //  First word charecter possition in the text
    53  	)
    54  
    55  	// TODO::: drop language-specific stop words??? (e.g. in English, the, an, a, and, etc.)
    56  	for i, char := range req.Text + " " { // TODO::: hack situation!! need to fix this and remove + " "
    57  		switch char {
    58  		case ' ', '\t', '\n', '\v', '\f', '\r', 0x85, 0xA0, '\'', '"', '`', ':', ',', '-': // unicode.IsSpace(char)
    59  			if wordStart {
    60  				word = req.Text[OffsetInText:i]
    61  			} else {
    62  				OffsetInSentence = uint64(i-lastSentenceIndex) + 1 // indicate next charecter as start of word
    63  				OffsetInText = uint64(i) + 1                       // indicate next charecter as start of word
    64  				continue
    65  			}
    66  		case '_':
    67  			// TODO:::
    68  			continue
    69  		case '#':
    70  			// TODO:::
    71  			continue
    72  		case '@':
    73  			// TODO:::
    74  			continue
    75  		case '$':
    76  			// TODO:::
    77  			continue
    78  		case '.', ';', '?', '!':
    79  			sentenceEnd = true
    80  			lastSentenceIndex = i
    81  			if wordStart {
    82  				word = req.Text[OffsetInText:i]
    83  			} else {
    84  				continue
    85  			}
    86  		case '[', '(', '{':
    87  		case ']', ')', '}':
    88  		default:
    89  			wordStart = true
    90  			continue
    91  		}
    92  
    93  		index, ok = indexes[word]
    94  		if ok {
    95  			index.Tokens = append(index.Tokens, WordToken{
    96  				RecordID:             req.RecordID,
    97  				RecordFieldID:        req.RecordFieldID,
    98  				WordOffsetInSentence: WordOffsetInSentence,
    99  				WordOffsetInText:     WordOffsetInText,
   100  				OffsetInSentence:     OffsetInSentence,
   101  				OffsetInText:         OffsetInText,
   102  			})
   103  		} else {
   104  			index = &IndexWord{
   105  				Word:               word,
   106  				RecordStructure:    req.RecordStructure,
   107  				RecordPrimaryKey:   req.RecordPrimaryKey,
   108  				RecordSecondaryKey: req.RecordSecondaryKey,
   109  				RecordOwnerID:      req.RecordOwnerID,
   110  				Tokens: []WordToken{
   111  					{
   112  						RecordID:             req.RecordID,
   113  						RecordFieldID:        req.RecordFieldID,
   114  						WordOffsetInSentence: WordOffsetInSentence,
   115  						WordOffsetInText:     WordOffsetInText,
   116  						OffsetInSentence:     OffsetInSentence,
   117  						OffsetInText:         OffsetInText,
   118  					},
   119  				},
   120  			}
   121  			indexes[word] = index
   122  		}
   123  
   124  		OffsetInSentence = uint64(i-lastSentenceIndex) + 1 // indicate next charecter as start of word
   125  		OffsetInText = uint64(i) + 1                       // indicate next charecter as start of word
   126  		WordOffsetInSentence++
   127  		WordOffsetInText++
   128  		wordStart = false
   129  
   130  		if sentenceEnd {
   131  			sentenceEnd = false
   132  			WordOffsetInSentence = 0
   133  			OffsetInSentence = 0
   134  		}
   135  	}
   136  	return
   137  }