github.com/jancarloviray/community@v0.41.1-0.20170124221257-33a66c87cf2f/core/api/convert/excerpt/excerpt.go

github.com/jancarloviray/community@v0.41.1-0.20170124221257-33a66c87cf2f/core/api/convert/excerpt/excerpt.go (about)

     1  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
     2  //
     3  // This software (Documize Community Edition) is licensed under
     4  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
     5  //
     6  // You can operate outside the AGPL restrictions by purchasing
     7  // Documize Enterprise Edition and obtaining a commercial license
     8  // by contacting <sales@documize.com>.
     9  //
    10  // https://documize.com
    11  
    12  // Package excerpt provides basic functionality to create excerpts of text in English.
    13  package excerpt
    14  
    15  import (
    16  	"sort"
    17  	"strings"
    18  	"unicode"
    19  	"unicode/utf8"
    20  
    21  	words "github.com/documize/community/core/wordlists/en-2012"
    22  
    23  	"github.com/rookii/paicehusk"
    24  )
    25  
    26  type extractItem struct {
    27  	sequence int
    28  	score    float64
    29  	count    int
    30  	sentance string
    31  }
    32  
    33  type extractList []extractItem
    34  
    35  // the Sort interface
    36  // Len is the number of elements in the collection.
    37  func (a extractList) Len() int { return len(a) }
    38  
    39  // Less reports whether the element with
    40  // index i should sort before the element with index j.
    41  func (a extractList) Less(i, j int) bool {
    42  	return (a[i].score / float64(a[i].count)) > (a[j].score / float64(a[j].count))
    43  }
    44  
    45  // Swap swaps the elements with indexes i and j.
    46  func (a extractList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    47  
    48  type presentItem struct {
    49  	sequence int
    50  	text     string
    51  }
    52  
    53  type presentList []presentItem
    54  
    55  // the Sort interface
    56  // Len is the number of elements in the collection.
    57  func (a presentList) Len() int { return len(a) }
    58  
    59  // Less reports whether the element with
    60  // index i should sort before the element with index j.
    61  func (a presentList) Less(i, j int) bool {
    62  	return a[i].sequence < a[j].sequence
    63  }
    64  
    65  // Swap swaps the elements with indexes i and j.
    66  func (a presentList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    67  
    68  func addWd(sentance, wd string) (string, bool) {
    69  	var isStop bool
    70  	if len(sentance) == 0 {
    71  		if wd != "[" {
    72  			sentance = wd
    73  		}
    74  	} else {
    75  		switch wd {
    76  		case "[": //NoOp
    77  		case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
    78  			if unicode.IsDigit(rune(sentance[len(sentance)-1])) {
    79  				sentance += wd
    80  			} else {
    81  				sentance += " " + wd
    82  			}
    83  		case ".", "!", "?":
    84  			isStop = true
    85  			fallthrough
    86  		default:
    87  			if isPunct(wd) {
    88  				sentance += wd
    89  			} else {
    90  				sentance += " " + wd
    91  			}
    92  		}
    93  	}
    94  	return sentance, isStop
    95  }
    96  
    97  func isPunct(s string) bool {
    98  	for _, r := range s {
    99  		if !unicode.IsPunct(r) {
   100  			switch r {
   101  			case '`', '\'', '"', '(', '/': // still punct
   102  			default:
   103  				return false
   104  			}
   105  		}
   106  	}
   107  	return true
   108  }
   109  
   110  // Excerpt returns the most statically significant 100 or so words of text for use in the Excerpt field
   111  func Excerpt(titleWords, bodyWords []string) string {
   112  	var el extractList
   113  
   114  	//fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords))
   115  
   116  	// populate stemMap
   117  	stemMap := make(map[string]uint64)
   118  	for _, wd := range bodyWords {
   119  		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
   120  		stemMap[stem]++
   121  	}
   122  	for _, wd := range titleWords {
   123  		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
   124  		stemMap[stem]++                         // TODO are words in titles more important?
   125  	}
   126  
   127  	wds := append(titleWords, bodyWords...)
   128  
   129  	sentance := ""
   130  	score := 0.0
   131  	count := 0
   132  	seq := 0
   133  	for _, wd := range wds {
   134  		var isStop bool
   135  
   136  		sentance, isStop = addWd(sentance, wd)
   137  
   138  		if isStop {
   139  			//fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n",
   140  			//	seq, score*10000/float64(count), sentance)
   141  			var ei extractItem
   142  			ei.count = count + 1 // must be at least 1
   143  			ei.score = score
   144  			ei.sentance = sentance
   145  			ei.sequence = seq
   146  			el = append(el, ei)
   147  			sentance = ""
   148  			score = 0.0
   149  			seq++
   150  		} else {
   151  			uncommon := true
   152  			// TODO Discuss correct level or maybe find a better algorithem for this
   153  			ent, ok := words.Words[wd]
   154  			if ok {
   155  				if ent.Rank <= 100 {
   156  					// do not score very common words
   157  					uncommon = false
   158  				}
   159  			}
   160  			if uncommon {
   161  				stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
   162  				usage, used := stemMap[stem]
   163  				if used {
   164  					relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
   165  					if relativeStemFreq > 0.0 {
   166  						score += relativeStemFreq
   167  					}
   168  				}
   169  				count++
   170  			}
   171  		}
   172  	}
   173  
   174  	sort.Sort(el)
   175  
   176  	return present(el)
   177  }
   178  
   179  func present(el extractList) (ret string) {
   180  	var pl presentList
   181  	words := 0
   182  
   183  	const excerptWords = 50
   184  
   185  	for s, e := range el {
   186  		if (words < excerptWords || s == 0) && len(e.sentance) > 1 &&
   187  			notEmpty(e.sentance) {
   188  			words += e.count
   189  			pl = append(pl, presentItem{sequence: e.sequence, text: e.sentance})
   190  			//fmt.Printf("DEBUG With score %3.2f on page %d // %s \n",
   191  			//	1000*e.score/float64(e.count), e.sequence, e.sentance)
   192  		}
   193  	}
   194  	sort.Sort(pl)
   195  
   196  	var lastSeq int
   197  	for p := range pl {
   198  		txt := strings.TrimPrefix(pl[p].text, ". ")
   199  		if p == 0 {
   200  			ret = txt
   201  			lastSeq = pl[0].sequence
   202  		} else {
   203  			thisSeq := pl[p].sequence
   204  			if lastSeq+1 != thisSeq {
   205  				ret += " …" // Horizontal elipsis character
   206  			}
   207  			ret += " " + txt
   208  			lastSeq = thisSeq
   209  		}
   210  	}
   211  	if len(ret) > 250 { // make sure the excerpt is not too long, shorten it if required
   212  		for len(ret) > 250 {
   213  			_, size := utf8.DecodeLastRuneInString(ret)
   214  			ret = ret[:len(ret)-size]
   215  		}
   216  		return ret + "…" // Horizontal elipsis character added after truncation
   217  	}
   218  	return ret
   219  }
   220  
   221  func notEmpty(wds string) bool {
   222  	for _, r := range wds {
   223  		if !unicode.IsPunct(r) && !unicode.IsSpace(r) {
   224  			return true
   225  		}
   226  	}
   227  	return false
   228  }