kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/kzip/info/info.go (about)

     1  /*
     2   * Copyright 2019 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Package info provides utilities for summarizing the contents of a kzip.
    18  package info // import "kythe.io/kythe/go/platform/kzip/info"
    19  
    20  import (
    21  	"fmt"
    22  	"path/filepath"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"kythe.io/kythe/go/platform/kzip"
    27  	"kythe.io/kythe/go/util/log"
    28  
    29  	"bitbucket.org/creachadair/stringset"
    30  	"google.golang.org/protobuf/encoding/prototext"
    31  
    32  	apb "kythe.io/kythe/proto/analysis_go_proto"
    33  	spb "kythe.io/kythe/proto/storage_go_proto"
    34  )
    35  
    36  // KzipInfo scans the kzip in f and counts contained files and units, giving a
    37  // breakdown by corpus and language. It also records the size (in bytes) of the
    38  // kzip specified by fileSize in the returned KzipInfo. This is a convenience
    39  // method and thin wrapper over the Accumulator. If you need to do more than
    40  // just calculate KzipInfo while doing a kzip.Scan(), you should use the
    41  // Accumulator directly.
    42  func KzipInfo(f kzip.File, fileSize int64, scanOpts ...kzip.ScanOption) (*apb.KzipInfo, error) {
    43  	a := NewAccumulator(fileSize)
    44  	if err := kzip.Scan(f, func(r *kzip.Reader, unit *kzip.Unit) error {
    45  		a.Accumulate(unit)
    46  		return nil
    47  	}, scanOpts...); err != nil {
    48  		return nil, fmt.Errorf("scanning kzip: %v", err)
    49  	}
    50  	return a.Get(), nil
    51  }
    52  
    53  // Accumulator is used to build a summary of a collection of compilation units.
    54  // Usage:
    55  //
    56  //	a := NewAccumulator(fileSize)
    57  //	a.Accumulate(unit) // call for each compilation unit
    58  //	info := a.Get()    // get the resulting KzipInfo
    59  type Accumulator struct {
    60  	KzipInfo *apb.KzipInfo
    61  }
    62  
    63  // NewAccumulator creates a new Accumulator instance given the kzip fileSize (in
    64  // bytes).
    65  func NewAccumulator(fileSize int64) *Accumulator {
    66  	return &Accumulator{
    67  		KzipInfo: &apb.KzipInfo{
    68  			Corpora: make(map[string]*apb.KzipInfo_CorpusInfo),
    69  			Size:    fileSize,
    70  		},
    71  	}
    72  }
    73  
    74  // Accumulate should be called for each unit in the kzip so its counts can be
    75  // recorded.
    76  func (a *Accumulator) Accumulate(u *kzip.Unit) {
    77  	// Set of canonicalized source file paths in the kzip
    78  	srcs := stringset.New()
    79  	for _, p := range u.Proto.SourceFile {
    80  		srcs.Add(filepath.Clean(p))
    81  	}
    82  
    83  	cuLang := u.Proto.GetVName().GetLanguage()
    84  	if cuLang == "" {
    85  		msg := fmt.Sprintf("CU(%s) does not specify a language", formatVName(u.Proto.GetVName()))
    86  		a.KzipInfo.CriticalKzipErrors = append(a.KzipInfo.CriticalKzipErrors, msg)
    87  		return
    88  	}
    89  
    90  	cuInfo(u.Proto.GetVName().GetCorpus(), cuLang, a.KzipInfo).Count++
    91  
    92  	if cuLang == "java" {
    93  		v := javaSourceVersion(u.Proto.GetArgument())
    94  		cuInfo(u.Proto.GetVName().GetCorpus(), cuLang, a.KzipInfo).JavaVersionCount[int32(v)]++
    95  	}
    96  
    97  	var srcCorpora stringset.Set
    98  	var absPaths stringset.Set
    99  	srcsWithRI := stringset.New()
   100  	for _, ri := range u.Proto.RequiredInput {
   101  		if strings.HasPrefix(ri.GetVName().GetPath(), "/") && !strings.HasPrefix(ri.GetVName().GetPath(), "/kythe_builtins/") {
   102  			absPaths.Add(ri.GetVName().GetPath())
   103  		}
   104  
   105  		riCorpus := requiredInputCorpus(u, ri)
   106  		if riCorpus == "" {
   107  			// Trim spaces to work around the fact that log("%v", proto) is inconsistent about trailing spaces in google3 vs open-source go.
   108  			msg := strings.TrimSpace(fmt.Sprintf("unable to determine corpus for required_input %q in CU(%s)", ri.Info.Path, formatVName(u.Proto.GetVName())))
   109  			a.KzipInfo.CriticalKzipErrors = append(a.KzipInfo.CriticalKzipErrors, msg)
   110  			return
   111  		}
   112  		requiredInputInfo(riCorpus, cuLang, a.KzipInfo).Count++
   113  		// canonicalize required_input path before checking against source
   114  		// files. In some cases, required_input paths may be non-canonical due
   115  		// to compiler idiosyncrasies (ahem c++), but it's ok to canonicalize
   116  		// for the purposes of this validation check.
   117  		normalizedInputPath := filepath.Clean(ri.Info.Path)
   118  		if srcs.Contains(normalizedInputPath) {
   119  			sourceInfo(riCorpus, cuLang, a.KzipInfo).Count++
   120  			srcCorpora.Add(riCorpus)
   121  			srcsWithRI.Add(normalizedInputPath)
   122  		}
   123  	}
   124  	srcsWithoutRI := srcs.Diff(srcsWithRI)
   125  	for path := range srcsWithoutRI {
   126  		msg := fmt.Sprintf("source %q in CU(%s) doesn't have a required_input entry", path, formatVName(u.Proto.GetVName()))
   127  		a.KzipInfo.CriticalKzipErrors = append(a.KzipInfo.CriticalKzipErrors, msg)
   128  	}
   129  	if srcCorpora.Len() != 1 {
   130  		// This is a warning for now, but may become an error.
   131  		log.Infof("Multiple corpora in unit. unit vname={%v}; src corpora=%v; srcs=%v", u.Proto.GetVName(), srcCorpora, u.Proto.SourceFile)
   132  	}
   133  
   134  	a.KzipInfo.AbsolutePaths = absPaths.Elements()
   135  }
   136  
   137  // javaSourceVersion checks if the CU args include setting source to a language level and if it does
   138  // it returns that version. If no source flag is set or it can't be read, it returns 0.
   139  func javaSourceVersion(args []string) int {
   140  	for i, arg := range args {
   141  		if arg == "-source" || arg == "--source" {
   142  			if i+1 < len(args) {
   143  				version := args[i+1]
   144  				version = strings.TrimPrefix(version, "1.")
   145  				intVersion, err := strconv.Atoi(version)
   146  				if err != nil {
   147  					log.Errorf("Unable to parse java version string: %v", err)
   148  					return 0
   149  				}
   150  				return intVersion
   151  			}
   152  		}
   153  	}
   154  	return 0
   155  }
   156  
   157  // Get returns the final KzipInfo after info from each unit in the kzip has been
   158  // accumulated.
   159  func (a *Accumulator) Get() *apb.KzipInfo {
   160  	return a.KzipInfo
   161  }
   162  
   163  // requiredInputCorpus computes the corpus for a required input. It follows the rules in the
   164  // CompilationUnit proto comments in kythe/proto/analysis.proto that say that any
   165  // required_input that does not set corpus in its VName should inherit corpus from the compilation
   166  // unit's VName.
   167  func requiredInputCorpus(u *kzip.Unit, ri *apb.CompilationUnit_FileInput) string {
   168  	if c := ri.GetVName().GetCorpus(); c != "" {
   169  		return c
   170  	}
   171  	return u.Proto.GetVName().GetCorpus()
   172  }
   173  
   174  // KzipInfoTotalCount returns the total CompilationUnits counts for infos split apart by language.
   175  func KzipInfoTotalCount(infos []*apb.KzipInfo) *apb.KzipInfo_CorpusInfo {
   176  	totals := &apb.KzipInfo_CorpusInfo{
   177  		LanguageRequiredInputs: make(map[string]*apb.KzipInfo_CorpusInfo_Inputs),
   178  		LanguageSources:        make(map[string]*apb.KzipInfo_CorpusInfo_Inputs),
   179  		LanguageCuInfo:         make(map[string]*apb.KzipInfo_CorpusInfo_CUInfo),
   180  	}
   181  	for _, info := range infos {
   182  		for _, i := range info.GetCorpora() {
   183  			for lang, stats := range i.GetLanguageRequiredInputs() {
   184  				total := totals.LanguageRequiredInputs[lang]
   185  				if total == nil {
   186  					total = &apb.KzipInfo_CorpusInfo_Inputs{}
   187  					totals.LanguageRequiredInputs[lang] = total
   188  				}
   189  				total.Count += stats.GetCount()
   190  			}
   191  			for lang, stats := range i.GetLanguageSources() {
   192  				total := totals.LanguageSources[lang]
   193  				if total == nil {
   194  					total = &apb.KzipInfo_CorpusInfo_Inputs{}
   195  					totals.LanguageSources[lang] = total
   196  				}
   197  				total.Count += stats.GetCount()
   198  			}
   199  			for lang, stats := range i.GetLanguageCuInfo() {
   200  				total := totals.LanguageCuInfo[lang]
   201  				if total == nil {
   202  					total = makeCUInfo()
   203  					totals.LanguageCuInfo[lang] = total
   204  				}
   205  				total.Count += stats.GetCount()
   206  				for version, count := range stats.GetJavaVersionCount() {
   207  					total.JavaVersionCount[version] += count
   208  				}
   209  			}
   210  		}
   211  	}
   212  	return totals
   213  }
   214  
   215  // MergeKzipInfo combines the counts from multiple KzipInfos.
   216  func MergeKzipInfo(infos []*apb.KzipInfo) *apb.KzipInfo {
   217  	kzipInfo := &apb.KzipInfo{Corpora: make(map[string]*apb.KzipInfo_CorpusInfo)}
   218  
   219  	for _, i := range infos {
   220  		for corpus, cinfo := range i.GetCorpora() {
   221  			for lang, inputs := range cinfo.GetLanguageRequiredInputs() {
   222  				c := requiredInputInfo(corpus, lang, kzipInfo)
   223  				c.Count += inputs.GetCount()
   224  			}
   225  			for lang, sources := range cinfo.GetLanguageSources() {
   226  				c := sourceInfo(corpus, lang, kzipInfo)
   227  				c.Count += sources.GetCount()
   228  			}
   229  			for lang, cu := range cinfo.GetLanguageCuInfo() {
   230  				c := cuInfo(corpus, lang, kzipInfo)
   231  				c.Count += cu.GetCount()
   232  				for version, count := range cu.GetJavaVersionCount() {
   233  					c.JavaVersionCount[version] += count
   234  				}
   235  			}
   236  		}
   237  		kzipInfo.CriticalKzipErrors = append(kzipInfo.GetCriticalKzipErrors(), i.GetCriticalKzipErrors()...)
   238  		kzipInfo.Size += i.Size
   239  		kzipInfo.AbsolutePaths = stringset.New(kzipInfo.AbsolutePaths...).Union(stringset.New(i.AbsolutePaths...)).Elements()
   240  	}
   241  	return kzipInfo
   242  }
   243  
   244  func requiredInputInfo(corpus, lang string, kzipInfo *apb.KzipInfo) *apb.KzipInfo_CorpusInfo_Inputs {
   245  	c := corpusInfo(corpus, kzipInfo)
   246  	lri := c.LanguageRequiredInputs[lang]
   247  	if lri == nil {
   248  		lri = &apb.KzipInfo_CorpusInfo_Inputs{}
   249  		c.LanguageRequiredInputs[lang] = lri
   250  	}
   251  	return lri
   252  }
   253  
   254  func sourceInfo(corpus, lang string, kzipInfo *apb.KzipInfo) *apb.KzipInfo_CorpusInfo_Inputs {
   255  	c := corpusInfo(corpus, kzipInfo)
   256  	ls := c.LanguageSources[lang]
   257  	if ls == nil {
   258  		ls = &apb.KzipInfo_CorpusInfo_Inputs{}
   259  		c.LanguageSources[lang] = ls
   260  	}
   261  	return ls
   262  }
   263  
   264  func cuInfo(corpus, lang string, kzipInfo *apb.KzipInfo) *apb.KzipInfo_CorpusInfo_CUInfo {
   265  	c := corpusInfo(corpus, kzipInfo)
   266  	cuInfo := c.LanguageCuInfo[lang]
   267  	if cuInfo == nil {
   268  		cuInfo = makeCUInfo()
   269  		c.LanguageCuInfo[lang] = cuInfo
   270  	}
   271  	return cuInfo
   272  }
   273  
   274  func makeCUInfo() *apb.KzipInfo_CorpusInfo_CUInfo {
   275  	return &apb.KzipInfo_CorpusInfo_CUInfo{
   276  		JavaVersionCount: make(map[int32]int32),
   277  	}
   278  }
   279  
   280  func corpusInfo(corpus string, kzipInfo *apb.KzipInfo) *apb.KzipInfo_CorpusInfo {
   281  	i := kzipInfo.GetCorpora()[corpus]
   282  	if i == nil {
   283  		i = &apb.KzipInfo_CorpusInfo{
   284  			LanguageRequiredInputs: make(map[string]*apb.KzipInfo_CorpusInfo_Inputs),
   285  			LanguageSources:        make(map[string]*apb.KzipInfo_CorpusInfo_Inputs),
   286  			LanguageCuInfo:         make(map[string]*apb.KzipInfo_CorpusInfo_CUInfo),
   287  		}
   288  		kzipInfo.Corpora[corpus] = i
   289  	}
   290  	return i
   291  }
   292  
   293  func formatVName(v *spb.VName) string {
   294  	return strings.ReplaceAll(prototext.MarshalOptions{}.Format(v), "  ", " ")
   295  }