github.com/SupersunnySea/draft@v0.16.0/pkg/linguist/data/generate_classifier.go

// +build ignore

/*
This program trains a naive Bayesian classifier
provided by https://github.com/jbrukh/bayesian
on a set of source code files
provided by https://github.com/github/linguist

This file is meant to be run by go generate;
refer to generate.go for its intended invocation.
*/
package main

import (
	"container/heap"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"runtime"
	"sync"

	"github.com/Azure/draft/pkg/linguist/tokenizer"
	"github.com/jbrukh/bayesian"
)

type sampleFile struct {
	lang, fp string
	tokens   []string
}

func main() {
	const (
		sourcePath = "./linguist/samples"
		outfile    = "./classifier"
		quiet      = false
	)

	log.SetFlags(0)
	if quiet {
		log.SetOutput(ioutil.Discard)
	}

	// first we read only the paths of the sample files
	// and their corresponding language names into:
	sampleFiles := []*sampleFile{}
	// and store all the language names into:
	languages := []string{}

	/*
		github/linguist has directory structure:

		...
		├── samples
		│   ├── (name of programming language)
		│   │   ├── (sample file in language)
		│   │   ├── (sample file in language)
		│   │   └── (sample file in language)
		│   ├── (name of another programming language)
		│   │   └── (sample file)
		...

		the following hard-coded logic expects this layout
	*/

	log.Println("Scanning", sourcePath, "...")
	srcDir, err := os.Open(sourcePath)
	checkErr(err)

	subDirs, err := srcDir.Readdir(-1)
	checkErr(err)

	for _, langDir := range subDirs {
		lang := langDir.Name()
		if !langDir.IsDir() {
			log.Println("unexpected file:", lang)
			continue
		}

		languages = append(languages, lang)

		samplePath := sourcePath + "/" + lang
		sampleDir, err := os.Open(samplePath)
		checkErr(err)
		files, err := sampleDir.Readdir(-1)
		checkErr(err)
		for _, file := range files {
			fp := samplePath + "/" + file.Name()
			if file.IsDir() {
				// skip subdirectories
				continue
			}
			sampleFiles = append(sampleFiles, &sampleFile{lang, fp, nil})
		}
		sampleDir.Close()
	}
	log.Println("Found", len(languages), "languages in", len(sampleFiles), "files")

	// simple progress bar; the mutex is needed because progressBar is
	// called concurrently from the main goroutine (while dispatching
	// requests) and from the receiver goroutine below
	var progressMu sync.Mutex
	progress := 0.0
	total := float64(len(sampleFiles)) * 2.0
	progressBar := func() {
		progressMu.Lock()
		defer progressMu.Unlock()
		progress++
		fmt.Printf("Processing files ... %.2f%%\r", progress/total*100.0)
	}

	// then we concurrently read and tokenize the samples
	sampleChan := make(chan *sampleFile)
	readyChan := make(chan struct{})
	received := 0
	tokenize := func(s *sampleFile) {
		f, err := os.Open(s.fp)
		checkErr(err)
		contents, err := ioutil.ReadAll(f)
		f.Close()
		checkErr(err)
		s.tokens = tokenizer.Tokenize(contents)
		sampleChan <- s
	}
	dox := map[string][]string{}
	for _, lang := range languages {
		dox[lang] = []string{}
	}
	// this receives the processed files and stores their tokens with their language
	go func() {
		for {
			s := <-sampleChan
			dox[s.lang] = append(dox[s.lang], s.tokens...)
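			// count each processed sample; once every sample has been
			// received, signal the main goroutine and stop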
			received++
			progressBar()
			if received == len(sampleFiles) {
				close(readyChan)
				return
			}
		}
	}()

	// this balances the workload (implementation at end of file)
	requests := getRequestsChan(len(sampleFiles))
	for i := range sampleFiles {
		requests <- &request{
			workFn: tokenize,
			arg:    sampleFiles[i],
		}
		progressBar()
	}

	// once that's done
	<-readyChan
	close(requests)
	fmt.Println() // for the progress bar

	// we train the classifier in the arbitrary manner that its API demands
	classes := make([]bayesian.Class, 0, len(languages))
	documents := make(map[bayesian.Class][]string)
	for _, lang := range languages {
		class := bayesian.Class(lang)
		classes = append(classes, class)
		documents[class] = dox[lang]
	}
	log.Println("Creating bayesian.Classifier ...")
	clsf := bayesian.NewClassifier(classes...)
	for cls, dox := range documents {
		clsf.Learn(dox, cls)
	}

	// and write the data to disk
	log.Println("Serializing and exporting bayesian.Classifier to", outfile, "...")
	checkErr(clsf.WriteToFile(outfile))

	log.Println("Done.")
}

func checkErr(err error) {
	if err != nil {
		log.Panicln(err)
	}
}

// simple load balancer from the "Concurrency is not Parallelism" talk
type request struct {
	workFn func(s *sampleFile)
	arg    *sampleFile
}

type worker struct {
	requests       chan *request
	pending, index int
}

func (w *worker) work(done chan *worker) {
	for {
		req := <-w.requests
		req.workFn(req.arg)
		done <- w
	}
}

type pool []*worker

func (p pool) Less(i, j int) bool { return p[i].pending < p[j].pending }
func (p pool) Len() int           { return len(p) }
func (p pool) Swap(i, j int) {
	p[i], p[j] = p[j], p[i]
	// keep each worker's heap index current so heap.Remove works correctly
	p[i].index = i
	p[j].index = j
}
func (p *pool) Push(x interface{}) {
	w := x.(*worker)
	w.index = len(*p)
	*p = append(*p, w)
}
func (p *pool) Pop() interface{} {
	old := *p
	n := len(old)
	x := old[n-1]
	*p = old[0 : n-1]
	return x
}

type balancer struct {
	workers pool
	done    chan *worker
}

func (b *balancer) balance(work chan *request) {
	for {
		select {
		case req, ok := <-work:
			if ok {
				b.dispatch(req)
			} else {
				return
			}
		case w := <-b.done:
			b.completed(w)
		}
	}
}

func (b *balancer) dispatch(req *request) {
	// hand the request to the least-loaded worker
	w := heap.Pop(&b.workers).(*worker)
	w.requests <- req
	w.pending++
	heap.Push(&b.workers, w)
}

func (b *balancer) completed(w *worker) {
	// a worker finished a request; lower its load and re-sort the heap
	w.pending--
	heap.Remove(&b.workers, w.index)
	heap.Push(&b.workers, w)
}

func getRequestsChan(jobs int) chan *request {
	done := make(chan *worker)
	// oversubscribe relative to GOMAXPROCS since the work is I/O-bound;
	// the exact multiplier is a guess
	workers := make(pool, runtime.GOMAXPROCS(0)*4)
	for i := 0; i < len(workers); i++ {
		w := &worker{make(chan *request, jobs), 0, i}
		go w.work(done)
		workers[i] = w
	}
	heap.Init(&workers)
	b := &balancer{workers, done}
	requests := make(chan *request)
	go b.balance(requests)
	return requests
}
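
// Usage sketch: a minimal example of how the exported classifier file could
// be loaded and queried by a consumer. This is an illustration, not part of
// this program; it assumes the jbrukh/bayesian API (NewClassifierFromFile,
// LogScores, and the exported Classes field) plus the same tokenizer, and
// the variable names (contents, best) are hypothetical.
//
//	clsf, err := bayesian.NewClassifierFromFile("classifier")
//	if err != nil {
//		log.Fatal(err)
//	}
//	tokens := tokenizer.Tokenize(contents) // contents of an unknown source file
//	_, best, _ := clsf.LogScores(tokens)   // index of the highest-scoring class
//	fmt.Println("guessed language:", clsf.Classes[best])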