github.com/xzntrc/go-enry/v2@v2.0.0-20230215091818-766cc1d65498/internal/code-generator/generator/frequencies.go (about)

     1  package generator
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"log"
     9  	"math"
    10  	"os"
    11  	"path/filepath"
    12  	"sort"
    13  	"strconv"
    14  	"strings"
    15  	"text/template"
    16  
    17  	"github.com/go-enry/go-enry/v2/internal/tokenizer"
    18  )
    19  
// samplesFrequencies aggregates the sample and token counts gathered from a
// samples directory. It is the data handed to the frequencies template.
type samplesFrequencies struct {
	// LanguageTotal is the number of sample files counted across all languages.
	LanguageTotal  int                       `json:"language_total,omitempty"`
	// Languages maps a language name to the number of its sample files.
	Languages      map[string]int            `json:"languages,omitempty"`
	// TokensTotal is the number of tokens counted across all languages.
	TokensTotal    int                       `json:"tokens_total,omitempty"`
	// Tokens maps a language name to the occurrence count of each of its tokens.
	Tokens         map[string]map[string]int `json:"tokens,omitempty"`
	// LanguageTokens maps a language name to its total number of tokens.
	LanguageTokens map[string]int            `json:"language_tokens,omitempty"`
}
    27  
    28  // Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write
    29  // the file outPath using tmplName as a template. It complies with type File signature.
    30  func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error {
    31  	freqs, err := getFrequencies(samplesDir)
    32  	if err != nil {
    33  		return err
    34  	}
    35  
    36  	if _, ok := os.LookupEnv("ENRY_DEBUG"); ok {
    37  		log.Printf("Total samples: %d\n", freqs.LanguageTotal)
    38  		log.Printf("Total tokens: %d\n", freqs.TokensTotal)
    39  
    40  		keys := make([]string, 0, len(freqs.Languages))
    41  		for k := range freqs.Languages {
    42  			keys = append(keys, k)
    43  		}
    44  		sort.Strings(keys)
    45  
    46  		for _, k := range keys {
    47  			fmt.Printf(" %s: %d\n", k, freqs.Languages[k])
    48  		}
    49  	}
    50  
    51  	buf := &bytes.Buffer{}
    52  	if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil {
    53  		return err
    54  	}
    55  
    56  	return formatedWrite(outPath, buf.Bytes())
    57  }
    58  
    59  func getFrequencies(samplesDir string) (*samplesFrequencies, error) {
    60  	langDirs, err := ioutil.ReadDir(samplesDir)
    61  	if err != nil {
    62  		return nil, err
    63  	}
    64  
    65  	var languageTotal int
    66  	var languages = make(map[string]int)
    67  	var tokensTotal int
    68  	var tokens = make(map[string]map[string]int)
    69  	var languageTokens = make(map[string]int)
    70  
    71  	for _, langDir := range langDirs {
    72  		if !langDir.IsDir() {
    73  			continue
    74  		}
    75  
    76  		lang := langDir.Name()
    77  		samples, err := readSamples(filepath.Join(samplesDir, lang))
    78  		if err != nil {
    79  			log.Println(err)
    80  		}
    81  
    82  		if len(samples) == 0 {
    83  			continue
    84  		}
    85  
    86  		samplesTokens, err := getTokens(samples)
    87  		if err != nil {
    88  			log.Println(err)
    89  			continue
    90  		}
    91  
    92  		languageTotal += len(samples)
    93  		languages[lang] = len(samples)
    94  		tokensTotal += len(samplesTokens)
    95  		languageTokens[lang] = len(samplesTokens)
    96  		tokens[lang] = make(map[string]int)
    97  		for _, token := range samplesTokens {
    98  			tokens[lang][token]++
    99  		}
   100  	}
   101  
   102  	return &samplesFrequencies{
   103  		TokensTotal:    tokensTotal,
   104  		LanguageTotal:  languageTotal,
   105  		Tokens:         tokens,
   106  		LanguageTokens: languageTokens,
   107  		Languages:      languages,
   108  	}, nil
   109  }
   110  
   111  // readSamples collects ./samples/ filenames from the Linguist codebase, skipping symlinks.
   112  func readSamples(samplesLangDir string) ([]string, error) {
   113  	const specialSubDir = "filenames"
   114  	var samples []string
   115  
   116  	err := filepath.Walk(samplesLangDir, func(path string, info os.FileInfo, err error) error {
   117  		if err != nil {
   118  			fmt.Printf("failure accessing a path %q: %v\n", path, err)
   119  			return err
   120  		}
   121  		if info.IsDir() {
   122  			switch info.Name() {
   123  			case filepath.Base(samplesLangDir):
   124  				return nil
   125  			case specialSubDir:
   126  				return nil
   127  			default:
   128  				return filepath.SkipDir
   129  			}
   130  		}
   131  		// skip git file symlinks on win and *nix
   132  		if isKnownSymlinkInLinguist(path) || !info.Mode().IsRegular() {
   133  			return nil
   134  		}
   135  		samples = append(samples, path)
   136  		return nil
   137  	})
   138  
   139  	return samples, err
   140  }
   141  
   142  // isKnownSymlinkInLinguist checks if the file name is on the list of known symlinks.
   143  // On Windows, there is no symlink support in Git [1] and those become regular text files,
   144  // so we have to skip these files manually, maintaining a list here :/
   145  //  1. https://github.com/git-for-windows/git/wiki/Symbolic-Links
   146  //
   147  // $ find -L .linguist/samples -xtype l
   148  func isKnownSymlinkInLinguist(path string) bool {
   149  	return strings.HasSuffix(path, filepath.Join("Ant Build System", "filenames", "build.xml")) ||
   150  		strings.HasSuffix(path, filepath.Join("Markdown", "symlink.md"))
   151  }
   152  
   153  func getTokens(samples []string) ([]string, error) {
   154  	tokens := make([]string, 0, 20)
   155  	var anyError error
   156  	for _, sample := range samples {
   157  		content, err := ioutil.ReadFile(sample)
   158  		if err != nil {
   159  			anyError = err
   160  			continue
   161  		}
   162  
   163  		t := tokenizer.Tokenize(content)
   164  		tokens = append(tokens, t...)
   165  	}
   166  
   167  	return tokens, anyError
   168  }
   169  
   170  func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error {
   171  	fmap := template.FuncMap{
   172  		"toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) },
   173  		"orderKeys": func(m map[string]int) []string {
   174  			keys := make([]string, 0, len(m))
   175  			for key := range m {
   176  				keys = append(keys, key)
   177  			}
   178  
   179  			sort.Strings(keys)
   180  			return keys
   181  		},
   182  		"languageLogProbability": func(language string) string {
   183  			num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal))
   184  			return fmt.Sprintf("%f", num)
   185  		},
   186  		"orderMapMapKeys": func(mm map[string]map[string]int) []string {
   187  			keys := make([]string, 0, len(mm))
   188  			for key := range mm {
   189  				keys = append(keys, key)
   190  			}
   191  
   192  			sort.Strings(keys)
   193  			return keys
   194  		},
   195  		"tokenLogProbability": func(language, token string) string {
   196  			num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language]))
   197  			return fmt.Sprintf("%f", num)
   198  		},
   199  		"quote": strconv.Quote,
   200  	}
   201  	return executeTemplate(out, tmplName, tmplPath, commit, fmap, freqs)
   202  }