github.com/dennwc/enry@v1.6.4-0.20180424151738-42391b8e105b/internal/code-generator/generator/samplesfreq.go (about)

     1  package generator
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"log"
     9  	"math"
    10  	"os"
    11  	"path/filepath"
    12  	"sort"
    13  	"strconv"
    14  	"text/template"
    15  
    16  	"gopkg.in/src-d/enry.v1/internal/tokenizer"
    17  )
    18  
// samplesFrequencies holds the sample and token counts collected by
// getFrequencies; it is the data value passed to the frequencies template.
type samplesFrequencies struct {
	// LanguageTotal is the number of samples summed over every language.
	LanguageTotal  int                       `json:"language_total,omitempty"`
	// Languages maps a language name to its number of samples.
	Languages      map[string]int            `json:"languages,omitempty"`
	// TokensTotal is the number of tokens summed over every language.
	TokensTotal    int                       `json:"tokens_total,omitempty"`
	// Tokens maps a language name to the occurrence count of each of its tokens.
	Tokens         map[string]map[string]int `json:"tokens,omitempty"`
	// LanguageTokens maps a language name to its total number of tokens.
	LanguageTokens map[string]int            `json:"language_tokens,omitempty"`
}
    26  
    27  // Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write
    28  // the file outPath using tmplName as a template. It complies with type File signature.
    29  func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error {
    30  	freqs, err := getFrequencies(samplesDir)
    31  	if err != nil {
    32  		return err
    33  	}
    34  
    35  	buf := &bytes.Buffer{}
    36  	if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil {
    37  		return err
    38  	}
    39  
    40  	return formatedWrite(outPath, buf.Bytes())
    41  }
    42  
    43  func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
    44  	entries, err := ioutil.ReadDir(samplesDir)
    45  	if err != nil {
    46  		return nil, err
    47  	}
    48  
    49  	var languageTotal int
    50  	var languages = make(map[string]int)
    51  	var tokensTotal int
    52  	var tokens = make(map[string]map[string]int)
    53  	var languageTokens = make(map[string]int)
    54  
    55  	for _, entry := range entries {
    56  		if !entry.IsDir() {
    57  			log.Println(err)
    58  			continue
    59  		}
    60  
    61  		samples, err := getSamples(samplesDir, entry)
    62  		if err != nil {
    63  			log.Println(err)
    64  		}
    65  
    66  		if len(samples) == 0 {
    67  			continue
    68  		}
    69  
    70  		samplesTokens, err := getTokens(samples)
    71  		if err != nil {
    72  			log.Println(err)
    73  			continue
    74  		}
    75  
    76  		lang := entry.Name()
    77  		languageTotal += len(samples)
    78  		languages[lang] = len(samples)
    79  		tokensTotal += len(samplesTokens)
    80  		languageTokens[lang] = len(samplesTokens)
    81  		tokens[lang] = make(map[string]int)
    82  		for _, token := range samplesTokens {
    83  			tokens[lang][token]++
    84  		}
    85  	}
    86  
    87  	return &samplesFrequencies{
    88  		TokensTotal:    tokensTotal,
    89  		LanguageTotal:  languageTotal,
    90  		Tokens:         tokens,
    91  		LanguageTokens: languageTokens,
    92  		Languages:      languages,
    93  	}, nil
    94  }
    95  
    96  func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) {
    97  	const samplesSubDir = "filenames"
    98  	samples := []string{}
    99  	path := filepath.Join(samplesDir, langDir.Name())
   100  	entries, err := ioutil.ReadDir(path)
   101  	if err != nil {
   102  		return nil, err
   103  	}
   104  
   105  	for _, entry := range entries {
   106  		if entry.Mode().IsRegular() {
   107  			samples = append(samples, filepath.Join(path, entry.Name()))
   108  		}
   109  
   110  		if entry.IsDir() && entry.Name() == samplesSubDir {
   111  			subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry)
   112  			if err != nil {
   113  				return nil, err
   114  			}
   115  
   116  			samples = append(samples, subSamples...)
   117  		}
   118  
   119  	}
   120  
   121  	return samples, nil
   122  }
   123  
   124  func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) {
   125  	subSamples := []string{}
   126  	path := filepath.Join(samplesDir, langDir, subLangDir.Name())
   127  	entries, err := ioutil.ReadDir(path)
   128  	if err != nil {
   129  		return nil, err
   130  	}
   131  
   132  	for _, entry := range entries {
   133  		if entry.Mode().IsRegular() {
   134  			subSamples = append(subSamples, filepath.Join(path, entry.Name()))
   135  		}
   136  	}
   137  
   138  	return subSamples, nil
   139  }
   140  
   141  func getTokens(samples []string) ([]string, error) {
   142  	tokens := make([]string, 0, 20)
   143  	var anyError error
   144  	for _, sample := range samples {
   145  		content, err := ioutil.ReadFile(sample)
   146  		if err != nil {
   147  			anyError = err
   148  			continue
   149  		}
   150  
   151  		t := tokenizer.Tokenize(content)
   152  		tokens = append(tokens, t...)
   153  	}
   154  
   155  	return tokens, anyError
   156  }
   157  
   158  func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error {
   159  	fmap := template.FuncMap{
   160  		"getCommit": func() string { return commit },
   161  		"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
   162  		"orderKeys": func(m map[string]int) []string {
   163  			keys := make([]string, 0, len(m))
   164  			for key := range m {
   165  				keys = append(keys, key)
   166  			}
   167  
   168  			sort.Strings(keys)
   169  			return keys
   170  		},
   171  		"languageLogProbability": func(language string) string {
   172  			num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
   173  			return fmt.Sprintf("%f", num)
   174  		},
   175  		"orderMapMapKeys": func(mm map[string]map[string]int) []string {
   176  			keys := make([]string, 0, len(mm))
   177  			for key := range mm {
   178  				keys = append(keys, key)
   179  			}
   180  
   181  			sort.Strings(keys)
   182  			return keys
   183  		},
   184  		"tokenLogProbability": func(language, token string) string {
   185  			num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
   186  			return fmt.Sprintf("%f", num)
   187  		},
   188  		"quote": strconv.Quote,
   189  	}
   190  
   191  	t := template.Must(template.New(tmplName).Funcs(fmap).ParseFiles(tmplPath))
   192  	if err := t.Execute(out, freqs); err != nil {
   193  		return err
   194  	}
   195  
   196  	return nil
   197  }