github.com/replit/upm@v0.0.0-20240423230255-9ce4fc3ea24c/internal/backends/python/gen_pypi_map/gen_pypi_map.go (about)

     1  package main
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/json"
     6  	"flag"
     7  	"fmt"
     8  	"os"
     9  	"sort"
    10  	"strings"
    11  	"time"
    12  )
    13  
    14  /*
    15  
    16  This CLI program provides the following commands, intended to be executed in order:
    17  
    18  * bq         - fetch pypi download stats
    19  * test       - test modules on pypi and save the results (1 file per package) in the cache directory
    20  * updatepkgs - read from the cache directory and update the pkgs.json file
    21  * gen        - read pkgs.json and generate pypi_map.sqlite file, containing mappings for package guessing
    22  
    23  Additionally,
    24  * test-one - run `test` for a single package
    25  */
    26  
    27  func cmd_bq(args []string) {
    28  	/*
    29  	   Fetch download stats from pypi's public big query table
    30  	   Parameters: gcp, bq
    31  	*/
    32  	bqCommandSet := flag.NewFlagSet("bq-flags", flag.ExitOnError)
    33  	bqGCP := bqCommandSet.String("gcp", "", "A GCP project ID to use to query bigquery directly.")
    34  	bqBQ := bqCommandSet.String("bq", "download_stats.json", "The result of a BigQuery against the pypi downloads dataset.")
    35  	if err := bqCommandSet.Parse(args); err != nil {
    36  		fmt.Fprintf(os.Stderr, "Failed to parse bq flags: %s\n", err)
    37  		return
    38  	}
    39  	if *bqGCP == "" {
    40  		fmt.Fprintln(os.Stderr, "Error: The 'gcp' flag must not be empty.")
    41  		return
    42  	}
    43  	err := FetchBQDownloads(*bqGCP, *bqBQ)
    44  	if err != nil {
    45  		fmt.Fprintf(os.Stderr, "Failed to fetch BQ download stats: %s\n", err.Error())
    46  		return
    47  	}
    48  }
    49  
    50  func cmd_test(args []string) {
    51  	/*
    52  		Test packages to find out the list of modules each one provides
    53  		Parameters: cache, index, workers, distMods
    54  	*/
    55  
    56  	testCommandSet := flag.NewFlagSet("test-flags", flag.ExitOnError)
    57  	testCache := testCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.")
    58  	testIndex := testCommandSet.String("index", "", "A json index file for packages containing an array of strings")
    59  	testWorkers := testCommandSet.Int("workers", 16, "The number of simultaneous workers to run")
    60  	testDistMods := testCommandSet.Bool("distMods", false, "Determine modules by examining dists")
    61  	testBQ := testCommandSet.String("bq", "download_stats.json", "The result of a BigQuery against the pypi downloads dataset.")
    62  	testForce := testCommandSet.Bool("force", false, "Force re-test when cached")
    63  	testPkgsFile := testCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.")
    64  	testRemapFile := testCommandSet.String("remapfile", "remap.csv", "A file containing alterations for when a popular package name should be replaced with a newer version")
    65  	testThreshold := testCommandSet.Int("threshold", 10000, "Only process packages with at least this many downloads")
    66  	testTimeout := testCommandSet.Int("timeout", 60, "The maximum number of seconds to wait for a package to install.")
    67  	if err := testCommandSet.Parse(args); err != nil {
    68  		fmt.Fprintf(os.Stderr, "Failed to parse test flags: %s\n", err)
    69  		return
    70  	}
    71  
    72  	var packages PackageIndex
    73  	if testThreshold != nil {
    74  		fmt.Printf("Loading pypi stats from cache file\n")
    75  		bqCache, err := LoadDownloadStats(*testBQ)
    76  		if err != nil {
    77  			fmt.Fprintf(os.Stderr, "Failed to load data from big query file %s: %v\n", *testBQ, err)
    78  			return
    79  		}
    80  		fmt.Printf("Loaded %v stats\n", len(bqCache))
    81  		normalizedBqCache := make(map[string]int)
    82  
    83  		for name, count := range bqCache {
    84  			normalizedBqCache[normalizePackageName(name)] = count
    85  		}
    86  		bqCache = normalizedBqCache
    87  
    88  		packageRemaps := make(map[string]string)
    89  		file, err := os.Open(*testRemapFile)
    90  
    91  		if err == nil {
    92  			scanner := bufio.NewScanner(file)
    93  			for scanner.Scan() {
    94  				columns := strings.SplitN(scanner.Text(), ",", 3)
    95  				if len(columns) > 0 {
    96  					old := columns[0]
    97  					new := columns[1]
    98  					// description := columns[2]
    99  					packageRemaps[old] = new
   100  				}
   101  			}
   102  
   103  			if err := scanner.Err(); err != nil {
   104  				panic(err)
   105  			}
   106  
   107  			file.Close()
   108  		}
   109  
   110  		// Deduplicate results
   111  		packageMap := make(map[string]bool)
   112  		for pkgName, count := range bqCache {
   113  			if count < *testThreshold {
   114  				continue
   115  			}
   116  			// Apply package rename
   117  			if newName, ok := packageRemaps[pkgName]; ok {
   118  				pkgName = newName
   119  			}
   120  			packageMap[pkgName] = true
   121  		}
   122  
   123  		packageList := []string{}
   124  		for pkgName := range packageMap {
   125  			packageList = append(packageList, pkgName)
   126  		}
   127  		fmt.Printf("Preparing to process %v packages\n", len(packageList))
   128  		packages = FakePackageIndex(packageList...)
   129  	} else if *testIndex != "" {
   130  		file, err := os.Open(*testIndex)
   131  		if err != nil {
   132  			fmt.Fprintf(os.Stderr, "Failed to open file %s: %s\n", *testIndex, err.Error())
   133  			return
   134  		}
   135  		var packageList []string
   136  		decoder := json.NewDecoder(file)
   137  		err = decoder.Decode(&packageList)
   138  		if err != nil {
   139  			fmt.Fprintf(os.Stderr, "Failed to decode file %s: %s\n", *testIndex, err.Error())
   140  			return
   141  		}
   142  		defer file.Close()
   143  		packages = FakePackageIndex(packageList...)
   144  	} else {
   145  		packages, _ = NewPackageIndex("https://pypi.org/simple/", -1)
   146  	}
   147  	TestModules(packages, *testCache, *testPkgsFile, *testDistMods, *testWorkers, *testForce, time.Duration(*testTimeout)*time.Second)
   148  }
   149  
   150  func cmd_test_one(args []string) {
   151  	/*
   152  		Test a single package to find the list of modules provided
   153  	*/
   154  
   155  	testOneCommandSet := flag.NewFlagSet("test-one-flags", flag.ExitOnError)
   156  	testOnePackage := testOneCommandSet.String("package", "", "Which package to test")
   157  	testOneCache := testOneCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.")
   158  	testOneDistMods := testOneCommandSet.Bool("distMods", false, "Determine modules by examining dists")
   159  	testOneForce := testOneCommandSet.Bool("force", false, "Force re-test when cached")
   160  	testOnePkgsFile := testOneCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.")
   161  	testOneTimeout := testOneCommandSet.Int("timeout", 60, "The maximum number of seconds to wait for a package to install.")
   162  	if err := testOneCommandSet.Parse(args); err != nil {
   163  		fmt.Fprintf(os.Stderr, "Failed to parse test flags: %s\n", err)
   164  		return
   165  	}
   166  	if *testOnePackage == "" {
   167  		fmt.Fprintf(os.Stderr, "Missing -package flag, cannot continue\n")
   168  		return
   169  	}
   170  
   171  	cache := LoadAllPackageInfo(*testOneCache, *testOnePkgsFile)
   172  	info, err := ProcessPackage(*testOnePackage, cache, *testOneCache, *testOneDistMods, *testOneForce, time.Duration(*testOneTimeout)*time.Second)
   173  	if err != nil {
   174  		fmt.Fprintf(os.Stderr, "Error processing package: %v\n", err)
   175  		return
   176  	}
   177  
   178  	fmt.Printf("Name: %s\n", info.Name)
   179  	fmt.Printf("Modules: %s\n", strings.Join(info.Modules, ", "))
   180  }
   181  
   182  func cmd_gen(args []string) {
   183  	/*
   184  		Generate source file that provides pypi mappings
   185  		Parameters: pkg, out, cachedfr, cachefile, bq, pypipackages
   186  	*/
   187  	genCommandSet := flag.NewFlagSet("gen-flags", flag.ExitOnError)
   188  	genPkg := genCommandSet.String("pkg", "python", "the pkg name for the output source")
   189  	genOut := genCommandSet.String("out", "pypi_map.sqlite", "the destination file for the generated data")
   190  	genCache := genCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.")
   191  	genPkgsFile := genCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.")
   192  	genPkgsLegacyFile := genCommandSet.String("legacypkgsfile", "pypi_packages.json", "Legacy dependencies information for each module - used as a fallback")
   193  	genBQ := genCommandSet.String("bq", "download_stats.json", "The result of a BigQuery against the pypi downloads dataset.")
   194  	if err := genCommandSet.Parse(args); err != nil {
   195  		fmt.Fprintf(os.Stderr, "Failed to parse gen flags: %s\n", err)
   196  		return
   197  	}
   198  
   199  	cache := LoadAllPackageInfo(*genCache, *genPkgsFile)
   200  	err := GenerateDB(*genPkg, *genOut, cache, *genBQ, *genPkgsLegacyFile)
   201  	if err != nil {
   202  		fmt.Fprintf(os.Stderr, "Failed to generate %s: %s\n", *genOut, err.Error())
   203  	}
   204  }
   205  
   206  func cmd_updatepkgs(args []string) {
   207  	/*
   208  		Update the pkgs.json file with the latest package information
   209  		Parameters: cache, pkgsfile
   210  	*/
   211  	updateCommandSet := flag.NewFlagSet("update-flags", flag.ExitOnError)
   212  	updateCache := updateCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.")
   213  	updatePkgsFile := updateCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.")
   214  	if err := updateCommandSet.Parse(args); err != nil {
   215  		fmt.Fprintf(os.Stderr, "Failed to parse update flags: %s\n", err)
   216  		return
   217  	}
   218  	err := UpdateAllPackageInfo(*updateCache, *updatePkgsFile)
   219  	if err != nil {
   220  		fmt.Fprintf(os.Stderr, "Failed update cache: %s\n", err.Error())
   221  	}
   222  }
   223  
   224  func main() {
   225  	command := ""
   226  	if len(os.Args) > 1 {
   227  		command = os.Args[1]
   228  	}
   229  	validCmds := map[string]func([]string){
   230  		"bq":         cmd_bq,
   231  		"test":       cmd_test,
   232  		"test-one":   cmd_test_one,
   233  		"updatepkgs": cmd_updatepkgs,
   234  		"gen":        cmd_gen,
   235  	}
   236  	if cmd, ok := validCmds[command]; ok {
   237  		cmd(os.Args[2:])
   238  	} else {
   239  		var msg string
   240  		if command != "" {
   241  			msg = fmt.Sprintf("Invalid command '%s'.", command)
   242  		} else {
   243  			msg = "No command provided."
   244  		}
   245  		choices := make([]string, 0, len(validCmds))
   246  		for cmd := range validCmds {
   247  			choices = append(choices, cmd)
   248  		}
   249  		sort.Strings(choices)
   250  		fmt.Fprintf(os.Stderr, "Error: %s\nValid commands are %s.\n", msg, strings.Join(choices, ", "))
   251  	}
   252  }