github.com/src-d/enry@v1.7.3/internal/code-generator/generator/samplesfreq.go (about)

     1  package generator
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"log"
     9  	"math"
    10  	"path/filepath"
    11  	"sort"
    12  	"strconv"
    13  	"text/template"
    14  
    15  	"gopkg.in/src-d/enry.v1/internal/tokenizer"
    16  )
    17  
    18  type samplesFrequencies struct {
    19  	LanguageTotal  int                       `json:"language_total,omitempty"`
    20  	Languages      map[string]int            `json:"languages,omitempty"`
    21  	TokensTotal    int                       `json:"tokens_total,omitempty"`
    22  	Tokens         map[string]map[string]int `json:"tokens,omitempty"`
    23  	LanguageTokens map[string]int            `json:"language_tokens,omitempty"`
    24  }
    25  
    26  // Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write
    27  // the file outPath using tmplName as a template. It complies with type File signature.
    28  func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error {
    29  	freqs, err := getFrequencies(samplesDir)
    30  	if err != nil {
    31  		return err
    32  	}
    33  
    34  	buf := &bytes.Buffer{}
    35  	if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil {
    36  		return err
    37  	}
    38  
    39  	return formatedWrite(outPath, buf.Bytes())
    40  }
    41  
    42  func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
    43  	langDirs, err := ioutil.ReadDir(samplesDir)
    44  	if err != nil {
    45  		return nil, err
    46  	}
    47  
    48  	var languageTotal int
    49  	var languages = make(map[string]int)
    50  	var tokensTotal int
    51  	var tokens = make(map[string]map[string]int)
    52  	var languageTokens = make(map[string]int)
    53  
    54  	for _, langDir := range langDirs {
    55  		if !langDir.IsDir() {
    56  			continue
    57  		}
    58  
    59  		lang := langDir.Name()
    60  		samples, err := readSamples(filepath.Join(samplesDir, lang))
    61  		if err != nil {
    62  			log.Println(err)
    63  		}
    64  
    65  		if len(samples) == 0 {
    66  			continue
    67  		}
    68  
    69  		samplesTokens, err := getTokens(samples)
    70  		if err != nil {
    71  			log.Println(err)
    72  			continue
    73  		}
    74  
    75  		languageTotal += len(samples)
    76  		languages[lang] = len(samples)
    77  		tokensTotal += len(samplesTokens)
    78  		languageTokens[lang] = len(samplesTokens)
    79  		tokens[lang] = make(map[string]int)
    80  		for _, token := range samplesTokens {
    81  			tokens[lang][token]++
    82  		}
    83  	}
    84  
    85  	return &samplesFrequencies{
    86  		TokensTotal:    tokensTotal,
    87  		LanguageTotal:  languageTotal,
    88  		Tokens:         tokens,
    89  		LanguageTokens: languageTokens,
    90  		Languages:      languages,
    91  	}, nil
    92  }
    93  
    94  func readSamples(samplesLangDir string) ([]string, error) {
    95  	const samplesLangFilesDir = "filenames"
    96  	sampleFiles, err := ioutil.ReadDir(samplesLangDir)
    97  	if err != nil {
    98  		return nil, err
    99  	}
   100  
   101  	var samples []string
   102  	for _, sampleFile := range sampleFiles {
   103  		filename := filepath.Join(samplesLangDir, sampleFile.Name())
   104  		if sampleFile.Mode().IsRegular() {
   105  			samples = append(samples, filename)
   106  			continue
   107  		}
   108  
   109  		if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir {
   110  			subSamples, err := readSubSamples(filename)
   111  			if err != nil {
   112  				return nil, err
   113  			}
   114  
   115  			samples = append(samples, subSamples...)
   116  		}
   117  
   118  	}
   119  
   120  	return samples, nil
   121  }
   122  
   123  func readSubSamples(path string) ([]string, error) {
   124  	subSamples := []string{}
   125  	entries, err := ioutil.ReadDir(path)
   126  	if err != nil {
   127  		return nil, err
   128  	}
   129  
   130  	for _, entry := range entries {
   131  		if entry.Mode().IsRegular() {
   132  			subSamples = append(subSamples, filepath.Join(path, entry.Name()))
   133  		}
   134  	}
   135  
   136  	return subSamples, nil
   137  }
   138  
   139  func getTokens(samples []string) ([]string, error) {
   140  	tokens := make([]string, 0, 20)
   141  	var anyError error
   142  	for _, sample := range samples {
   143  		content, err := ioutil.ReadFile(sample)
   144  		if err != nil {
   145  			anyError = err
   146  			continue
   147  		}
   148  
   149  		t := tokenizer.Tokenize(content)
   150  		tokens = append(tokens, t...)
   151  	}
   152  
   153  	return tokens, anyError
   154  }
   155  
   156  func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error {
   157  	fmap := template.FuncMap{
   158  		"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
   159  		"orderKeys": func(m map[string]int) []string {
   160  			keys := make([]string, 0, len(m))
   161  			for key := range m {
   162  				keys = append(keys, key)
   163  			}
   164  
   165  			sort.Strings(keys)
   166  			return keys
   167  		},
   168  		"languageLogProbability": func(language string) string {
   169  			num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
   170  			return fmt.Sprintf("%f", num)
   171  		},
   172  		"orderMapMapKeys": func(mm map[string]map[string]int) []string {
   173  			keys := make([]string, 0, len(mm))
   174  			for key := range mm {
   175  				keys = append(keys, key)
   176  			}
   177  
   178  			sort.Strings(keys)
   179  			return keys
   180  		},
   181  		"tokenLogProbability": func(language, token string) string {
   182  			num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
   183  			return fmt.Sprintf("%f", num)
   184  		},
   185  		"quote": strconv.Quote,
   186  	}
   187  	return executeTemplate(out, tmplName, tmplPath, commit, fmap, freqs)
   188  }