github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/wordlists/makewordlist.go (about)

     1  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
     2  //
     3  // This software (Documize Community Edition) is licensed under 
     4  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
     5  //
     6  // You can operate outside the AGPL restrictions by purchasing
     7  // Documize Enterprise Edition and obtaining a commercial license
     8  // by contacting <sales@documize.com>. 
     9  //
    10  // https://documize.com
    11  
    12  // Package main creates ordered lists of English words and their stems,
    13  // based on their frequency.
    14  package main
    15  
    16  import (
    17  	"bytes"
    18  	"fmt"
    19  	"io/ioutil"
    20  	"sort"
    21  
    22  	"github.com/rookii/paicehusk"
    23  )
    24  
    25  type wordFreqEntry struct {
    26  	rawFreq int
    27  	Freq    float64
    28  }
    29  
    30  type wordFreqMap map[string]wordFreqEntry
    31  
    32  type wordFreqSortEntry struct {
    33  	Name string
    34  	Freq float64
    35  }
    36  type wordFreqSort []wordFreqSortEntry
    37  
    38  // Len is the number of elements in the collection.
    39  func (wfs wordFreqSort) Len() int { return len(wfs) }
    40  
    41  // Less reports whether the element with
    42  // index i should sort before the element with index j.
    43  func (wfs wordFreqSort) Less(i, j int) bool { return wfs[i].Freq > wfs[j].Freq }
    44  
    45  // Swap swaps the elements with indexes i and j.
    46  func (wfs wordFreqSort) Swap(i, j int) { wfs[j], wfs[i] = wfs[i], wfs[j] }
    47  
    48  func main() {
    49  
    50  	txt, err := ioutil.ReadFile("./en-2012/en.txt")
    51  	if err != nil {
    52  		panic(err)
    53  	}
    54  
    55  	lines := bytes.Split(txt, []byte("\n"))
    56  
    57  	wfm := make(wordFreqMap)
    58  	rfTot := 0
    59  	for r, l := range lines {
    60  		words := bytes.Split(l, []byte(" "))
    61  		if len(words) >= 2 {
    62  			var rf int
    63  			_, err = fmt.Sscanf(string(words[1]), "%d", &rf)
    64  			if err == nil && len(words[0]) > 0 {
    65  				if r < 10000 { // only look at the most common 10k words, 100k makes go compile/link unworkable
    66  					stem := string(words[0]) // NOTE not stemming at present
    67  					entry, alredythere := wfm[stem]
    68  					if alredythere {
    69  						entry.rawFreq += rf
    70  						wfm[stem] = entry
    71  					} else {
    72  						wfm[stem] = wordFreqEntry{rawFreq: rf, Freq: 0.0}
    73  					}
    74  				}
    75  				rfTot += rf
    76  			}
    77  		}
    78  	}
    79  	for k, v := range wfm {
    80  		v.Freq = float64(v.rawFreq) / float64(rfTot)
    81  		wfm[k] = v
    82  	}
    83  
    84  	wfs := make(wordFreqSort, len(wfm))
    85  	idx := 0
    86  	for k, v := range wfm {
    87  		wfs[idx].Name = k
    88  		wfs[idx].Freq = v.Freq
    89  		idx++
    90  	}
    91  	sort.Sort(wfs)
    92  	writeWords(wfs, wfm)
    93  }
    94  
    95  func writeWords(wfs wordFreqSort, wfm wordFreqMap) {
    96  	var goprog bytes.Buffer
    97  	var err error
    98  
    99  	fmt.Fprintf(&goprog, `
   100  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
   101  //
   102  // This software (Documize Community Edition) is licensed under 
   103  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
   104  //
   105  // You can operate outside the AGPL restrictions by purchasing
   106  // Documize Enterprise Edition and obtaining a commercial license
   107  // by contacting <sales@documize.com>. 
   108  //
   109  // https://documize.com
   110  
   111  // Package words was auto-generated !
   112  // From base data at http://invokeit.wordpress.com/frequency-word-lists/ .
   113  // The word stems were produced using github.com/rookii/paicehusk .
   114  // DO NOT EDIT BY HAND.
   115  package words
   116  
   117  // Entry type describes the rank and frequency of a prarticular word.
   118  type Entry struct {
   119  	Rank    int      // Word Rank order, 1 most frequent.
   120  	Freq    float64  // Word Frequency, a fraction, larger is more frequent. 
   121  }
   122  
   123  // Map type provides the Entry information for each word.
   124  type Map map[string]Entry
   125  
   126  // Words gives the Entry information on the most frequent words.
   127  var Words = Map{
   128  `)
   129  	for i, v := range wfs {
   130  		fmt.Fprintf(&goprog, "\t"+`"%s": Entry{Rank:%d,Freq:%g},`+"\n", v.Name, i+1, v.Freq)
   131  	}
   132  	fmt.Fprintf(&goprog, "}\n\n")
   133  
   134  	sfm := make(map[string]float64)
   135  	for k, v := range wfm {
   136  		sfm[paicehusk.DefaultRules.Stem(k)] += v.Freq
   137  	}
   138  	fmt.Fprintf(&goprog, "// Stems gives the frequency of word-stems.\nvar Stems = map[string]float64{\n")
   139  	for k, v := range sfm {
   140  		fmt.Fprintf(&goprog, "\t"+`"%s": %g,`+"\n", k, v)
   141  	}
   142  	fmt.Fprintf(&goprog, "}\n\n")
   143  
   144  	err = ioutil.WriteFile("./en-2012/englishwords.go", goprog.Bytes(), 0666)
   145  
   146  	if err != nil {
   147  		panic(err)
   148  	}
   149  }