github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/wordlists/makewordlist.go (about) 1 // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved. 2 // 3 // This software (Documize Community Edition) is licensed under 4 // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html 5 // 6 // You can operate outside the AGPL restrictions by purchasing 7 // Documize Enterprise Edition and obtaining a commercial license 8 // by contacting <sales@documize.com>. 9 // 10 // https://documize.com 11 12 // Package main creates ordered lists of english words and their stems, 13 // based on their frequency. 14 package main 15 16 import ( 17 "bytes" 18 "fmt" 19 "io/ioutil" 20 "sort" 21 22 "github.com/rookii/paicehusk" 23 ) 24 25 type wordFreqEntry struct { 26 rawFreq int 27 Freq float64 28 } 29 30 type wordFreqMap map[string]wordFreqEntry 31 32 type wordFreqSortEntry struct { 33 Name string 34 Freq float64 35 } 36 type wordFreqSort []wordFreqSortEntry 37 38 // Len is the number of elements in the collection. 39 func (wfs wordFreqSort) Len() int { return len(wfs) } 40 41 // Less reports whether the element with 42 // index i should sort before the element with index j. 43 func (wfs wordFreqSort) Less(i, j int) bool { return wfs[i].Freq > wfs[j].Freq } 44 45 // Swap swaps the elements with indexes i and j. 46 func (wfs wordFreqSort) Swap(i, j int) { wfs[j], wfs[i] = wfs[i], wfs[j] } 47 48 func main() { 49 50 txt, err := ioutil.ReadFile("./en-2012/en.txt") 51 if err != nil { 52 panic(err) 53 } 54 55 lines := bytes.Split(txt, []byte("\n")) 56 57 wfm := make(wordFreqMap) 58 rfTot := 0 59 for r, l := range lines { 60 words := bytes.Split(l, []byte(" ")) 61 if len(words) >= 2 { 62 var rf int 63 _, err = fmt.Sscanf(string(words[1]), "%d", &rf) 64 if err == nil && len(words[0]) > 0 { 65 if r < 10000 { // only look at the most common 10k words, 100k makes go compile/link unworkable 66 stem := string(words[0]) // NOTE not stemming at present 67 entry, alredythere := wfm[stem] 68 if alredythere { 69 entry.rawFreq += rf 70 wfm[stem] = entry 71 } else { 72 wfm[stem] = wordFreqEntry{rawFreq: rf, Freq: 0.0} 73 } 74 } 75 rfTot += rf 76 } 77 } 78 } 79 for k, v := range wfm { 80 v.Freq = float64(v.rawFreq) / float64(rfTot) 81 wfm[k] = v 82 } 83 84 wfs := make(wordFreqSort, len(wfm)) 85 idx := 0 86 for k, v := range wfm { 87 wfs[idx].Name = k 88 wfs[idx].Freq = v.Freq 89 idx++ 90 } 91 sort.Sort(wfs) 92 writeWords(wfs, wfm) 93 } 94 95 func writeWords(wfs wordFreqSort, wfm wordFreqMap) { 96 var goprog bytes.Buffer 97 var err error 98 99 fmt.Fprintf(&goprog, ` 100 // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved. 101 // 102 // This software (Documize Community Edition) is licensed under 103 // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html 104 // 105 // You can operate outside the AGPL restrictions by purchasing 106 // Documize Enterprise Edition and obtaining a commercial license 107 // by contacting <sales@documize.com>. 108 // 109 // https://documize.com 110 111 // Package words was auto-generated ! 112 // From base data at http://invokeit.wordpress.com/frequency-word-lists/ . 113 // The word stems were produced using github.com/rookii/paicehusk . 114 // DO NOT EDIT BY HAND. 115 package words 116 117 // Entry type describes the rank and frequency of a prarticular word. 118 type Entry struct { 119 Rank int // Word Rank order, 1 most frequent. 120 Freq float64 // Word Frequency, a fraction, larger is more frequent. 121 } 122 123 // Map type provides the Entry information for each word. 124 type Map map[string]Entry 125 126 // Words gives the Entry information on the most frequent words. 127 var Words = Map{ 128 `) 129 for i, v := range wfs { 130 fmt.Fprintf(&goprog, "\t"+`"%s": Entry{Rank:%d,Freq:%g},`+"\n", v.Name, i+1, v.Freq) 131 } 132 fmt.Fprintf(&goprog, "}\n\n") 133 134 sfm := make(map[string]float64) 135 for k, v := range wfm { 136 sfm[paicehusk.DefaultRules.Stem(k)] += v.Freq 137 } 138 fmt.Fprintf(&goprog, "// Stems gives the frequency of word-stems.\nvar Stems = map[string]float64{\n") 139 for k, v := range sfm { 140 fmt.Fprintf(&goprog, "\t"+`"%s": %g,`+"\n", k, v) 141 } 142 fmt.Fprintf(&goprog, "}\n\n") 143 144 err = ioutil.WriteFile("./en-2012/englishwords.go", goprog.Bytes(), 0666) 145 146 if err != nil { 147 panic(err) 148 } 149 }