github.com/replit/upm@v0.0.0-20240423230255-9ce4fc3ea24c/internal/backends/python/gen_pypi_map/gen_pypi_map.go (about) 1 package main 2 3 import ( 4 "bufio" 5 "encoding/json" 6 "flag" 7 "fmt" 8 "os" 9 "sort" 10 "strings" 11 "time" 12 ) 13 14 /* 15 16 This CLI program provides the following commands, intended to be executed in order: 17 18 * bq - fetch pypi download stats 19 * test - test modules on pypi and save the results (1 file per package) in the cache directory 20 * updatepkgs - read from the cache directory and update the pkgs.json file 21 * gen - read pkgs.json and generate pypi_map.sqlite file, containing mappings for package guessing 22 23 Additionally, 24 * test-one - run `test` for a single package 25 */ 26 27 func cmd_bq(args []string) { 28 /* 29 Fetch download stats from pypi's public big query table 30 Parameters: gcp, bq 31 */ 32 bqCommandSet := flag.NewFlagSet("bq-flags", flag.ExitOnError) 33 bqGCP := bqCommandSet.String("gcp", "", "A GCP project ID to use to query bigquery directly.") 34 bqBQ := bqCommandSet.String("bq", "download_stats.json", "The result of a BigQuery against the pypi downloads dataset.") 35 if err := bqCommandSet.Parse(args); err != nil { 36 fmt.Fprintf(os.Stderr, "Failed to parse bq flags: %s\n", err) 37 return 38 } 39 if *bqGCP == "" { 40 fmt.Fprintln(os.Stderr, "Error: The 'gcp' flag must not be empty.") 41 return 42 } 43 err := FetchBQDownloads(*bqGCP, *bqBQ) 44 if err != nil { 45 fmt.Fprintf(os.Stderr, "Failed to fetch BQ download stats: %s\n", err.Error()) 46 return 47 } 48 } 49 50 func cmd_test(args []string) { 51 /* 52 Test packages to find out the list of modules each one provides 53 Parameters: cache, index, workers, distMods 54 */ 55 56 testCommandSet := flag.NewFlagSet("test-flags", flag.ExitOnError) 57 testCache := testCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.") 58 testIndex := testCommandSet.String("index", "", "A json index file for packages containing an array of strings") 59 testWorkers := testCommandSet.Int("workers", 16, "The number of simultaneous workers to run") 60 testDistMods := testCommandSet.Bool("distMods", false, "Determine modules by examining dists") 61 testBQ := testCommandSet.String("bq", "download_stats.json", "The result of a BigQuery against the pypi downloads dataset.") 62 testForce := testCommandSet.Bool("force", false, "Force re-test when cached") 63 testPkgsFile := testCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.") 64 testRemapFile := testCommandSet.String("remapfile", "remap.csv", "A file containing alterations for when a popular package name should be replaced with a newer version") 65 testThreshold := testCommandSet.Int("threshold", 10000, "Only process packages with at least this many downloads") 66 testTimeout := testCommandSet.Int("timeout", 60, "The maximum number of seconds to wait for a package to install.") 67 if err := testCommandSet.Parse(args); err != nil { 68 fmt.Fprintf(os.Stderr, "Failed to parse test flags: %s\n", err) 69 return 70 } 71 72 var packages PackageIndex 73 if testThreshold != nil { 74 fmt.Printf("Loading pypi stats from cache file\n") 75 bqCache, err := LoadDownloadStats(*testBQ) 76 if err != nil { 77 fmt.Fprintf(os.Stderr, "Failed to load data from big query file %s: %v\n", *testBQ, err) 78 return 79 } 80 fmt.Printf("Loaded %v stats\n", len(bqCache)) 81 normalizedBqCache := make(map[string]int) 82 83 for name, count := range bqCache { 84 normalizedBqCache[normalizePackageName(name)] = count 85 } 86 bqCache = normalizedBqCache 87 88 packageRemaps := make(map[string]string) 89 file, err := os.Open(*testRemapFile) 90 91 if err == nil { 92 scanner := bufio.NewScanner(file) 93 for scanner.Scan() { 94 columns := strings.SplitN(scanner.Text(), ",", 3) 95 if len(columns) > 0 { 96 old := columns[0] 97 new := columns[1] 98 // description := columns[2] 99 packageRemaps[old] = new 100 } 101 } 102 103 if err := scanner.Err(); err != nil { 104 panic(err) 105 } 106 107 file.Close() 108 } 109 110 // Deduplicate results 111 packageMap := make(map[string]bool) 112 for pkgName, count := range bqCache { 113 if count < *testThreshold { 114 continue 115 } 116 // Apply package rename 117 if newName, ok := packageRemaps[pkgName]; ok { 118 pkgName = newName 119 } 120 packageMap[pkgName] = true 121 } 122 123 packageList := []string{} 124 for pkgName := range packageMap { 125 packageList = append(packageList, pkgName) 126 } 127 fmt.Printf("Preparing to process %v packages\n", len(packageList)) 128 packages = FakePackageIndex(packageList...) 129 } else if *testIndex != "" { 130 file, err := os.Open(*testIndex) 131 if err != nil { 132 fmt.Fprintf(os.Stderr, "Failed to open file %s: %s\n", *testIndex, err.Error()) 133 return 134 } 135 var packageList []string 136 decoder := json.NewDecoder(file) 137 err = decoder.Decode(&packageList) 138 if err != nil { 139 fmt.Fprintf(os.Stderr, "Failed to decode file %s: %s\n", *testIndex, err.Error()) 140 return 141 } 142 defer file.Close() 143 packages = FakePackageIndex(packageList...) 144 } else { 145 packages, _ = NewPackageIndex("https://pypi.org/simple/", -1) 146 } 147 TestModules(packages, *testCache, *testPkgsFile, *testDistMods, *testWorkers, *testForce, time.Duration(*testTimeout)*time.Second) 148 } 149 150 func cmd_test_one(args []string) { 151 /* 152 Test a single package to find the list of modules provided 153 */ 154 155 testOneCommandSet := flag.NewFlagSet("test-one-flags", flag.ExitOnError) 156 testOnePackage := testOneCommandSet.String("package", "", "Which package to test") 157 testOneCache := testOneCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.") 158 testOneDistMods := testOneCommandSet.Bool("distMods", false, "Determine modules by examining dists") 159 testOneForce := testOneCommandSet.Bool("force", false, "Force re-test when cached") 160 testOnePkgsFile := testOneCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.") 161 testOneTimeout := testOneCommandSet.Int("timeout", 60, "The maximum number of seconds to wait for a package to install.") 162 if err := testOneCommandSet.Parse(args); err != nil { 163 fmt.Fprintf(os.Stderr, "Failed to parse test flags: %s\n", err) 164 return 165 } 166 if *testOnePackage == "" { 167 fmt.Fprintf(os.Stderr, "Missing -package flag, cannot continue\n") 168 return 169 } 170 171 cache := LoadAllPackageInfo(*testOneCache, *testOnePkgsFile) 172 info, err := ProcessPackage(*testOnePackage, cache, *testOneCache, *testOneDistMods, *testOneForce, time.Duration(*testOneTimeout)*time.Second) 173 if err != nil { 174 fmt.Fprintf(os.Stderr, "Error processing package: %v\n", err) 175 return 176 } 177 178 fmt.Printf("Name: %s\n", info.Name) 179 fmt.Printf("Modules: %s\n", strings.Join(info.Modules, ", ")) 180 } 181 182 func cmd_gen(args []string) { 183 /* 184 Generate source file that provides pypi mappings 185 Parameters: pkg, out, cachedfr, cachefile, bq, pypipackages 186 */ 187 genCommandSet := flag.NewFlagSet("gen-flags", flag.ExitOnError) 188 genPkg := genCommandSet.String("pkg", "python", "the pkg name for the output source") 189 genOut := genCommandSet.String("out", "pypi_map.sqlite", "the destination file for the generated data") 190 genCache := genCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.") 191 genPkgsFile := genCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.") 192 genPkgsLegacyFile := genCommandSet.String("legacypkgsfile", "pypi_packages.json", "Legacy dependencies information for each module - used as a fallback") 193 genBQ := genCommandSet.String("bq", "download_stats.json", "The result of a BigQuery against the pypi downloads dataset.") 194 if err := genCommandSet.Parse(args); err != nil { 195 fmt.Fprintf(os.Stderr, "Failed to parse gen flags: %s\n", err) 196 return 197 } 198 199 cache := LoadAllPackageInfo(*genCache, *genPkgsFile) 200 err := GenerateDB(*genPkg, *genOut, cache, *genBQ, *genPkgsLegacyFile) 201 if err != nil { 202 fmt.Fprintf(os.Stderr, "Failed to generate %s: %s\n", *genOut, err.Error()) 203 } 204 } 205 206 func cmd_updatepkgs(args []string) { 207 /* 208 Update the pkgs.json file with the latest package information 209 Parameters: cache, pkgsfile 210 */ 211 updateCommandSet := flag.NewFlagSet("update-flags", flag.ExitOnError) 212 updateCache := updateCommandSet.String("cache", "cache", "A directory where to store temporary cached information for each module.") 213 updatePkgsFile := updateCommandSet.String("pkgsfile", "pkgs.json", "A file where to store permanent information for each module.") 214 if err := updateCommandSet.Parse(args); err != nil { 215 fmt.Fprintf(os.Stderr, "Failed to parse update flags: %s\n", err) 216 return 217 } 218 err := UpdateAllPackageInfo(*updateCache, *updatePkgsFile) 219 if err != nil { 220 fmt.Fprintf(os.Stderr, "Failed update cache: %s\n", err.Error()) 221 } 222 } 223 224 func main() { 225 command := "" 226 if len(os.Args) > 1 { 227 command = os.Args[1] 228 } 229 validCmds := map[string]func([]string){ 230 "bq": cmd_bq, 231 "test": cmd_test, 232 "test-one": cmd_test_one, 233 "updatepkgs": cmd_updatepkgs, 234 "gen": cmd_gen, 235 } 236 if cmd, ok := validCmds[command]; ok { 237 cmd(os.Args[2:]) 238 } else { 239 var msg string 240 if command != "" { 241 msg = fmt.Sprintf("Invalid command '%s'.", command) 242 } else { 243 msg = "No command provided." 244 } 245 choices := make([]string, 0, len(validCmds)) 246 for cmd := range validCmds { 247 choices = append(choices, cmd) 248 } 249 sort.Strings(choices) 250 fmt.Fprintf(os.Stderr, "Error: %s\nValid commands are %s.\n", msg, strings.Join(choices, ", ")) 251 } 252 }