github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/git/repo_language_stats_nogogit.go (about)

     1  // Copyright 2023 The GitBundle Inc. All rights reserved.
     2  // Copyright 2017 The Gitea Authors. All rights reserved.
     3  // Use of this source code is governed by a MIT-style
     4  // license that can be found in the LICENSE file.
     5  
     6  //go:build !gogit
     7  
     8  package git
     9  
    10  import (
    11  	"bufio"
    12  	"bytes"
    13  	"io"
    14  	"math"
    15  	"strings"
    16  
    17  	"github.com/gitbundle/modules/analyze"
    18  	"github.com/gitbundle/modules/log"
    19  
    20  	"github.com/go-enry/go-enry/v2"
    21  )
    22  
    23  // GetLanguageStats calculates language stats for git repository at specified commit
    24  func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
    25  	// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
    26  	// so let's create a batch stdin and stdout
    27  	batchStdinWriter, batchReader, cancel := repo.CatFileBatch(repo.Ctx)
    28  	defer cancel()
    29  
    30  	writeID := func(id string) error {
    31  		_, err := batchStdinWriter.Write([]byte(id + "\n"))
    32  		return err
    33  	}
    34  
    35  	if err := writeID(commitID); err != nil {
    36  		return nil, err
    37  	}
    38  	shaBytes, typ, size, err := ReadBatchLine(batchReader)
    39  	if typ != "commit" {
    40  		log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
    41  		return nil, ErrNotExist{commitID, ""}
    42  	}
    43  
    44  	sha, err := NewIDFromString(string(shaBytes))
    45  	if err != nil {
    46  		log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
    47  		return nil, ErrNotExist{commitID, ""}
    48  	}
    49  
    50  	commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
    51  	if err != nil {
    52  		log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
    53  		return nil, err
    54  	}
    55  	if _, err = batchReader.Discard(1); err != nil {
    56  		return nil, err
    57  	}
    58  
    59  	tree := commit.Tree
    60  
    61  	entries, err := tree.ListEntriesRecursive()
    62  	if err != nil {
    63  		return nil, err
    64  	}
    65  
    66  	checker, deferable := repo.CheckAttributeReader(commitID)
    67  	defer deferable()
    68  
    69  	contentBuf := bytes.Buffer{}
    70  	var content []byte
    71  	sizes := make(map[string]int64)
    72  	for _, f := range entries {
    73  		select {
    74  		case <-repo.Ctx.Done():
    75  			return sizes, repo.Ctx.Err()
    76  		default:
    77  		}
    78  
    79  		contentBuf.Reset()
    80  		content = contentBuf.Bytes()
    81  
    82  		if f.Size() == 0 {
    83  			continue
    84  		}
    85  
    86  		notVendored := false
    87  		notGenerated := false
    88  
    89  		if checker != nil {
    90  			attrs, err := checker.CheckPath(f.Name())
    91  			if err == nil {
    92  				if vendored, has := attrs["linguist-vendored"]; has {
    93  					if vendored == "set" || vendored == "true" {
    94  						continue
    95  					}
    96  					notVendored = vendored == "false"
    97  				}
    98  				if generated, has := attrs["linguist-generated"]; has {
    99  					if generated == "set" || generated == "true" {
   100  						continue
   101  					}
   102  					notGenerated = generated == "false"
   103  				}
   104  				if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
   105  					// group languages, such as Pug -> HTML; SCSS -> CSS
   106  					group := enry.GetLanguageGroup(language)
   107  					if len(group) != 0 {
   108  						language = group
   109  					}
   110  
   111  					sizes[language] += f.Size()
   112  					continue
   113  				} else if language, has := attrs["gitlab-language"]; has && language != "unspecified" && language != "" {
   114  					// strip off a ? if present
   115  					if idx := strings.IndexByte(language, '?'); idx >= 0 {
   116  						language = language[:idx]
   117  					}
   118  					if len(language) != 0 {
   119  						// group languages, such as Pug -> HTML; SCSS -> CSS
   120  						group := enry.GetLanguageGroup(language)
   121  						if len(group) != 0 {
   122  							language = group
   123  						}
   124  
   125  						sizes[language] += f.Size()
   126  						continue
   127  					}
   128  				}
   129  
   130  			}
   131  		}
   132  
   133  		if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) ||
   134  			enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
   135  			continue
   136  		}
   137  
   138  		// If content can not be read or file is too big just do detection by filename
   139  
   140  		if f.Size() <= bigFileSize {
   141  			if err := writeID(f.ID.String()); err != nil {
   142  				return nil, err
   143  			}
   144  			_, _, size, err := ReadBatchLine(batchReader)
   145  			if err != nil {
   146  				log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
   147  				return nil, err
   148  			}
   149  
   150  			sizeToRead := size
   151  			discard := int64(1)
   152  			if size > fileSizeLimit {
   153  				sizeToRead = fileSizeLimit
   154  				discard = size - fileSizeLimit + 1
   155  			}
   156  
   157  			_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
   158  			if err != nil {
   159  				return nil, err
   160  			}
   161  			content = contentBuf.Bytes()
   162  			err = discardFull(batchReader, discard)
   163  			if err != nil {
   164  				return nil, err
   165  			}
   166  		}
   167  		if !notGenerated && enry.IsGenerated(f.Name(), content) {
   168  			continue
   169  		}
   170  
   171  		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
   172  		// - eg. do the all the detection tests using filename first before reading content.
   173  		language := analyze.GetCodeLanguage(f.Name(), content)
   174  		if language == enry.OtherLanguage || language == "" {
   175  			continue
   176  		}
   177  
   178  		// group languages, such as Pug -> HTML; SCSS -> CSS
   179  		group := enry.GetLanguageGroup(language)
   180  		if group != "" {
   181  			language = group
   182  		}
   183  
   184  		sizes[language] += f.Size()
   185  		continue
   186  	}
   187  
   188  	// filter special languages unless they are the only language
   189  	if len(sizes) > 1 {
   190  		for language := range sizes {
   191  			langtype := enry.GetLanguageType(language)
   192  			if langtype != enry.Programming && langtype != enry.Markup {
   193  				delete(sizes, language)
   194  			}
   195  		}
   196  	}
   197  
   198  	return sizes, nil
   199  }
   200  
   201  func discardFull(rd *bufio.Reader, discard int64) error {
   202  	if discard > math.MaxInt32 {
   203  		n, err := rd.Discard(math.MaxInt32)
   204  		discard -= int64(n)
   205  		if err != nil {
   206  			return err
   207  		}
   208  	}
   209  	for discard > 0 {
   210  		n, err := rd.Discard(int(discard))
   211  		discard -= int64(n)
   212  		if err != nil {
   213  			return err
   214  		}
   215  	}
   216  	return nil
   217  }