github.com/replit/upm@v0.0.0-20240423230255-9ce4fc3ea24c/internal/backends/python/gen_pypi_map/db_gen.go (about) 1 package main 2 3 import ( 4 "database/sql" 5 "encoding/json" 6 "errors" 7 "fmt" 8 "os" 9 "strings" 10 11 _ "github.com/mattn/go-sqlite3" 12 ) 13 14 func GenerateDB(pkg string, outputFilePath string, cache map[string]PackageInfo, bqFilePath string, pkgsLegacyFile string) error { 15 downloadStats, err := LoadDownloadStats(bqFilePath) 16 if err != nil { 17 return err 18 } 19 20 legacyPypiPackages := loadLegacyPypyPackages(pkgsLegacyFile) 21 22 packagesProcessed := make(map[string]bool) 23 var moduleToPackageList = map[string][]PackageInfo{} 24 25 for _, info := range cache { 26 pkgName := strings.ToLower(info.Name) 27 if info.Error != "" { 28 // fallback to legacy package module info 29 legacyInfo, ok := legacyPypiPackages[pkgName] 30 if ok { 31 info.Modules = legacyInfo.Mods 32 } 33 } 34 packagesProcessed[pkgName] = true 35 for _, module := range info.Modules { 36 moduleToPackageList[module] = append(moduleToPackageList[module], info) 37 } 38 } 39 40 // Backfill legacy package info that is missing from our cache 41 for pkg, legacyInfo := range legacyPypiPackages { 42 _, ok := packagesProcessed[pkg] 43 if ok { 44 continue 45 } 46 var info PackageInfo 47 info.Name = legacyInfo.Pkg 48 info.Modules = legacyInfo.Mods 49 50 for _, module := range info.Modules { 51 moduleToPackageList[module] = append(moduleToPackageList[module], info) 52 } 53 } 54 55 fmt.Printf("Loaded %d modules\n", len(moduleToPackageList)) 56 57 err = os.Remove(outputFilePath) 58 if err != nil && !errors.Is(err, os.ErrNotExist) { 59 return err 60 } 61 db, err := sql.Open("sqlite3", outputFilePath) 62 if err != nil { 63 return err 64 } 65 _, err = db.Exec(` 66 create table module_to_pypi_package (module_name text primary key, guess text, reason text); 67 create table pypi_packages (package_name text primary key, module_list text); 68 `) 69 if err != nil { 70 return err 71 } 72 73 // Write all data within one transaction for speed 74 // https://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite 75 _, err = db.Exec(`begin transaction;`) 76 if err != nil { 77 return err 78 } 79 80 // Guess at every module, add the guess and the package that was guessed to 81 // the map 82 for moduleName, candidates := range moduleToPackageList { 83 if guess, reason, guessable := GuessPackage(moduleName, candidates, downloadStats); guessable { 84 stmt, err := db.Prepare("insert into module_to_pypi_package values (?, ?, ?);") 85 if err != nil { 86 return err 87 } 88 _, err = stmt.Exec(moduleName, guess.Name, reason) 89 if err != nil { 90 return err 91 } 92 stmt.Close() 93 94 stmt, err = db.Prepare(` 95 insert into pypi_packages values (?, ?) 96 on conflict (package_name) 97 do update set 98 module_list = excluded.module_list; 99 `) 100 if err != nil { 101 return err 102 } 103 _, err = stmt.Exec(guess.Name, strings.Join(guess.Modules, ",")) 104 if err != nil { 105 return fmt.Errorf("%s on %s", err.Error(), guess.Name) 106 } 107 stmt.Close() 108 } 109 } 110 111 _, err = db.Exec(`end transaction;`) 112 if err != nil { 113 return err 114 } 115 116 err = db.Close() 117 if err != nil { 118 return err 119 } 120 121 // Make it read only 122 err = os.Chmod(outputFilePath, 0444) 123 if err != nil { 124 return err 125 } 126 127 fmt.Printf("Wrote %s\n", outputFilePath) 128 return nil 129 130 } 131 132 func loadLegacyPypyPackages(filePath string) map[string]LegacyPackageInfo { 133 injson, err := os.Open(filePath) 134 if err != nil { 135 return make(map[string]LegacyPackageInfo) 136 } 137 infoMap := make(map[string]LegacyPackageInfo) 138 139 dec := json.NewDecoder(injson) 140 for dec.More() { 141 var info LegacyPackageInfo 142 143 err = dec.Decode(&info) 144 if err != nil { 145 continue 146 } 147 info.Pkg = strings.ToLower(info.Pkg) 148 infoMap[info.Pkg] = info 149 } 150 151 return infoMap 152 }