github.com/geniusesgroup/libgo@v0.0.0-20220713101832-828057a9d3d4/matn/text-index-set.go (about) 1 /* For license and copyright information please see LEGAL file in repository */ 2 3 package matn 4 5 import ( 6 er "../error" 7 ) 8 9 // TextIndexSetReq is request structure of TextIndex() 10 type TextIndexSetReq struct { 11 RecordID [32]byte 12 RecordStructure uint64 13 RecordPrimaryKey [32]byte // Store any primary ID or any data up to 32 byte length e.g. ID 14 RecordSecondaryKey [32]byte // Store any secondary ID or any data up to 32 byte length e.g. GroupID 15 RecordOwnerID [32]byte 16 RecordFieldID uint8 17 Text string 18 } 19 20 // TextIndexSet index given text 21 func TextIndexSet(req *TextIndexSetReq) (err protocol.Error) { 22 var indexes = WordTokenization(req) 23 24 for _, index := range indexes { 25 err = index.SaveOrUpdate() 26 } 27 28 // TODO::: need to index or do anything here about phrases??? 29 30 return 31 } 32 33 // WordTokenization uses the delimiters categorized under [Dash, Hyphen, Pattern_Syntax, Quotation_Mark, Terminal_Punctuation, White_Space] 34 // https://github.com/jdkato/prose 35 func WordTokenization(req *TextIndexSetReq) (indexes map[string]*IndexWord) { 36 // var fields = strings.Fields(req.Text) 37 // fmt.Println(fields) 38 39 indexes = map[string]*IndexWord{} 40 41 var ( 42 ok bool 43 index *IndexWord 44 word string 45 wordStart bool 46 sentenceEnd bool 47 lastSentenceIndex int 48 49 WordOffsetInSentence uint64 // Position of the word in the sentence 50 WordOffsetInText uint64 // Position of the word in the text 51 OffsetInSentence uint64 // First word charecter possition in the sentence 52 OffsetInText uint64 // First word charecter possition in the text 53 ) 54 55 // TODO::: drop language-specific stop words??? (e.g. in English, the, an, a, and, etc.) 56 for i, char := range req.Text + " " { // TODO::: hack situation!! need to fix this and remove + " " 57 switch char { 58 case ' ', '\t', '\n', '\v', '\f', '\r', 0x85, 0xA0, '\'', '"', '`', ':', ',', '-': // unicode.IsSpace(char) 59 if wordStart { 60 word = req.Text[OffsetInText:i] 61 } else { 62 OffsetInSentence = uint64(i-lastSentenceIndex) + 1 // indicate next charecter as start of word 63 OffsetInText = uint64(i) + 1 // indicate next charecter as start of word 64 continue 65 } 66 case '_': 67 // TODO::: 68 continue 69 case '#': 70 // TODO::: 71 continue 72 case '@': 73 // TODO::: 74 continue 75 case '$': 76 // TODO::: 77 continue 78 case '.', ';', '?', '!': 79 sentenceEnd = true 80 lastSentenceIndex = i 81 if wordStart { 82 word = req.Text[OffsetInText:i] 83 } else { 84 continue 85 } 86 case '[', '(', '{': 87 case ']', ')', '}': 88 default: 89 wordStart = true 90 continue 91 } 92 93 index, ok = indexes[word] 94 if ok { 95 index.Tokens = append(index.Tokens, WordToken{ 96 RecordID: req.RecordID, 97 RecordFieldID: req.RecordFieldID, 98 WordOffsetInSentence: WordOffsetInSentence, 99 WordOffsetInText: WordOffsetInText, 100 OffsetInSentence: OffsetInSentence, 101 OffsetInText: OffsetInText, 102 }) 103 } else { 104 index = &IndexWord{ 105 Word: word, 106 RecordStructure: req.RecordStructure, 107 RecordPrimaryKey: req.RecordPrimaryKey, 108 RecordSecondaryKey: req.RecordSecondaryKey, 109 RecordOwnerID: req.RecordOwnerID, 110 Tokens: []WordToken{ 111 { 112 RecordID: req.RecordID, 113 RecordFieldID: req.RecordFieldID, 114 WordOffsetInSentence: WordOffsetInSentence, 115 WordOffsetInText: WordOffsetInText, 116 OffsetInSentence: OffsetInSentence, 117 OffsetInText: OffsetInText, 118 }, 119 }, 120 } 121 indexes[word] = index 122 } 123 124 OffsetInSentence = uint64(i-lastSentenceIndex) + 1 // indicate next charecter as start of word 125 OffsetInText = uint64(i) + 1 // indicate next charecter as start of word 126 WordOffsetInSentence++ 127 WordOffsetInText++ 128 wordStart = false 129 130 if sentenceEnd { 131 sentenceEnd = false 132 WordOffsetInSentence = 0 133 OffsetInSentence = 0 134 } 135 } 136 return 137 }