github.com/bcampbell/scrapeomat@v0.0.0-20220820232205-23e64141c89e/cmd/loadtool/main.go

package main

// load dumped articles into scrapeomat db.
// work in progress - fix as required ;-)

import (
	"flag"
	"fmt"
	"os"
	"strings"
	//"time"
	"path/filepath"

	"github.com/bcampbell/scrapeomat/store"
	"github.com/bcampbell/scrapeomat/store/sqlstore"
	_ "github.com/lib/pq"
	_ "github.com/mattn/go-sqlite3"
)

type Art struct {
	store.Article
	// some convenience fields
	URL     string `json:"url,omitempty"`
	Byline  string `json:"byline,omitempty"`
	Pubcode string `json:"pubcode,omitempty"`
}

// An article stream from the slurp API has each article in its own object:
//	{"article": {...}}
//	{"article": {...}}
// (A minimal decoding sketch follows at the end of this listing.)
type WireFmt struct {
	Art `json:"article,omitempty"`
}

var opts struct {
	driver           string
	connStr          string
	pubCode          string
	ignoreLoadErrors bool
	htmlEscape       bool
	recursive        bool
	forceUpdate      bool
}

const usageTxt = `usage: loadtool [options] [file(s)]

Imports articles from json files into a scrapeomat db.
Input json format is the same as the slurp API output.
`

func main() {

	flag.Usage = func() {
		fmt.Fprint(os.Stderr, usageTxt)
		flag.PrintDefaults()
		os.Exit(2)
	}

	// flag.BoolVar(&opts.ignoreLoadErrors, "i", false, "ignore load errors - skip failed art and continue")
	flag.BoolVar(&opts.recursive, "r", false, "Recursive - descend into dirs to find json files.")
	flag.StringVar(&opts.connStr, "db", "", "database connection string (or set SCRAPEOMAT_DB)")
	flag.StringVar(&opts.driver, "driver", "", "database driver name (defaults to sqlite3 if SCRAPEOMAT_DRIVER is unset)")
	flag.BoolVar(&opts.forceUpdate, "f", false, "force update of articles already in db")
	flag.StringVar(&opts.pubCode, "pubcode", "", "publication shortcode (if not in article data)")
	flag.BoolVar(&opts.htmlEscape, "e", false, "HTML-escape plain text content field")
	flag.Parse()

	if flag.NArg() < 1 {
		fmt.Fprintf(os.Stderr, "ERROR: no input files\n")
		os.Exit(1)
	}

	jsonFiles, err := collectFiles(flag.Args(), opts.recursive)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
		os.Exit(1)
	}

	// Open the target db. Driver and connection string fall back to the
	// SCRAPEOMAT_DRIVER and SCRAPEOMAT_DB environment variables if unset.
	db, err := sqlstore.NewWithEnv(opts.driver, opts.connStr)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR opening db: %s\n", err)
		os.Exit(1)
	}
	defer db.Close()

	imp := NewImporter(db)
	imp.UpdateExisting = opts.forceUpdate

	for _, jsonFile := range jsonFiles {
		err := imp.ImportJSONFile(jsonFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "ERROR: %s\n", err)
			os.Exit(1)
		}
	}
}

// collectFiles builds the list of input files from the commandline args.
func collectFiles(args []string, recurse bool) ([]string, error) {
	found := []string{}
	for _, name := range args {
		inf, err := os.Stat(name)
		if err != nil {
			return nil, err
		}
		if inf.IsDir() {
			if !recurse {
				return nil, fmt.Errorf("%s is a directory (did you want -r?)", name)
			}
			foo, err := findJsonFilesRecursive(name)
			if err != nil {
				return nil, err
			}
			found = append(found, foo...)
		} else {
			found = append(found, name)
		}
	}
	return found, nil
}

// findJsonFilesRecursive returns all .json files under rootDir.
func findJsonFilesRecursive(rootDir string) ([]string, error) {
	files := []string{}
	err := filepath.Walk(rootDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if info.IsDir() {
			return nil
		}

		if strings.HasSuffix(path, ".json") {
			files = append(files, path)
		}

		return nil
	})

	return files, err
}
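
For reference, a minimal sketch of how the wire format described at WireFmt above could be consumed: since each article arrives as its own {"article": {...}} object, a json.Decoder can be called in a loop until io.EOF. This is illustrative only; the real decoding lives in the package's Importer (NewImporter / ImportJSONFile), whose source is not shown here, and the readWireStream helper name is made up.

package main

import (
	"encoding/json"
	"io"
)

// readWireStream decodes a stream of {"article": {...}} objects from r,
// using the WireFmt/Art types defined in main.go.
// Hypothetical helper for illustration only; not part of loadtool.
func readWireStream(r io.Reader) ([]Art, error) {
	arts := []Art{}
	dec := json.NewDecoder(r)
	for {
		var w WireFmt
		err := dec.Decode(&w)
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		arts = append(arts, w.Art)
	}
	return arts, nil
}

Decoding into WireFmt rather than Art directly is what strips off the {"article": ...} wrapper transparently, via the embedded Art field and its `json:"article"` tag.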