github.com/jancarloviray/community@v0.41.1-0.20170124221257-33a66c87cf2f/core/api/convert/excerpt/excerpt.go (about) 1 // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved. 2 // 3 // This software (Documize Community Edition) is licensed under 4 // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html 5 // 6 // You can operate outside the AGPL restrictions by purchasing 7 // Documize Enterprise Edition and obtaining a commercial license 8 // by contacting <sales@documize.com>. 9 // 10 // https://documize.com 11 12 // Package excerpt provides basic functionality to create excerpts of text in English. 13 package excerpt 14 15 import ( 16 "sort" 17 "strings" 18 "unicode" 19 "unicode/utf8" 20 21 words "github.com/documize/community/core/wordlists/en-2012" 22 23 "github.com/rookii/paicehusk" 24 ) 25 26 type extractItem struct { 27 sequence int 28 score float64 29 count int 30 sentance string 31 } 32 33 type extractList []extractItem 34 35 // the Sort interface 36 // Len is the number of elements in the collection. 37 func (a extractList) Len() int { return len(a) } 38 39 // Less reports whether the element with 40 // index i should sort before the element with index j. 41 func (a extractList) Less(i, j int) bool { 42 return (a[i].score / float64(a[i].count)) > (a[j].score / float64(a[j].count)) 43 } 44 45 // Swap swaps the elements with indexes i and j. 46 func (a extractList) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 47 48 type presentItem struct { 49 sequence int 50 text string 51 } 52 53 type presentList []presentItem 54 55 // the Sort interface 56 // Len is the number of elements in the collection. 57 func (a presentList) Len() int { return len(a) } 58 59 // Less reports whether the element with 60 // index i should sort before the element with index j. 61 func (a presentList) Less(i, j int) bool { 62 return a[i].sequence < a[j].sequence 63 } 64 65 // Swap swaps the elements with indexes i and j. 66 func (a presentList) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 67 68 func addWd(sentance, wd string) (string, bool) { 69 var isStop bool 70 if len(sentance) == 0 { 71 if wd != "[" { 72 sentance = wd 73 } 74 } else { 75 switch wd { 76 case "[": //NoOp 77 case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9": 78 if unicode.IsDigit(rune(sentance[len(sentance)-1])) { 79 sentance += wd 80 } else { 81 sentance += " " + wd 82 } 83 case ".", "!", "?": 84 isStop = true 85 fallthrough 86 default: 87 if isPunct(wd) { 88 sentance += wd 89 } else { 90 sentance += " " + wd 91 } 92 } 93 } 94 return sentance, isStop 95 } 96 97 func isPunct(s string) bool { 98 for _, r := range s { 99 if !unicode.IsPunct(r) { 100 switch r { 101 case '`', '\'', '"', '(', '/': // still punct 102 default: 103 return false 104 } 105 } 106 } 107 return true 108 } 109 110 // Excerpt returns the most statically significant 100 or so words of text for use in the Excerpt field 111 func Excerpt(titleWords, bodyWords []string) string { 112 var el extractList 113 114 //fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords)) 115 116 // populate stemMap 117 stemMap := make(map[string]uint64) 118 for _, wd := range bodyWords { 119 stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word 120 stemMap[stem]++ 121 } 122 for _, wd := range titleWords { 123 stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word 124 stemMap[stem]++ // TODO are words in titles more important? 125 } 126 127 wds := append(titleWords, bodyWords...) 128 129 sentance := "" 130 score := 0.0 131 count := 0 132 seq := 0 133 for _, wd := range wds { 134 var isStop bool 135 136 sentance, isStop = addWd(sentance, wd) 137 138 if isStop { 139 //fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n", 140 // seq, score*10000/float64(count), sentance) 141 var ei extractItem 142 ei.count = count + 1 // must be at least 1 143 ei.score = score 144 ei.sentance = sentance 145 ei.sequence = seq 146 el = append(el, ei) 147 sentance = "" 148 score = 0.0 149 seq++ 150 } else { 151 uncommon := true 152 // TODO Discuss correct level or maybe find a better algorithem for this 153 ent, ok := words.Words[wd] 154 if ok { 155 if ent.Rank <= 100 { 156 // do not score very common words 157 uncommon = false 158 } 159 } 160 if uncommon { 161 stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word 162 usage, used := stemMap[stem] 163 if used { 164 relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem] 165 if relativeStemFreq > 0.0 { 166 score += relativeStemFreq 167 } 168 } 169 count++ 170 } 171 } 172 } 173 174 sort.Sort(el) 175 176 return present(el) 177 } 178 179 func present(el extractList) (ret string) { 180 var pl presentList 181 words := 0 182 183 const excerptWords = 50 184 185 for s, e := range el { 186 if (words < excerptWords || s == 0) && len(e.sentance) > 1 && 187 notEmpty(e.sentance) { 188 words += e.count 189 pl = append(pl, presentItem{sequence: e.sequence, text: e.sentance}) 190 //fmt.Printf("DEBUG With score %3.2f on page %d // %s \n", 191 // 1000*e.score/float64(e.count), e.sequence, e.sentance) 192 } 193 } 194 sort.Sort(pl) 195 196 var lastSeq int 197 for p := range pl { 198 txt := strings.TrimPrefix(pl[p].text, ". ") 199 if p == 0 { 200 ret = txt 201 lastSeq = pl[0].sequence 202 } else { 203 thisSeq := pl[p].sequence 204 if lastSeq+1 != thisSeq { 205 ret += " …" // Horizontal elipsis character 206 } 207 ret += " " + txt 208 lastSeq = thisSeq 209 } 210 } 211 if len(ret) > 250 { // make sure the excerpt is not too long, shorten it if required 212 for len(ret) > 250 { 213 _, size := utf8.DecodeLastRuneInString(ret) 214 ret = ret[:len(ret)-size] 215 } 216 return ret + "…" // Horizontal elipsis character added after truncation 217 } 218 return ret 219 } 220 221 func notEmpty(wds string) bool { 222 for _, r := range wds { 223 if !unicode.IsPunct(r) && !unicode.IsSpace(r) { 224 return true 225 } 226 } 227 return false 228 }